Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-12-14 16:36:57 +00:00)

Compare commits: swedish-do ... refactor-s (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 9874efc352 | |
.github/workflows/test-suite.yml (vendored, 2 changes)

@@ -116,7 +116,7 @@ jobs:
 override: true
 - name: Run cargo tree without default features and check lindera is not present
 run: |
-if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then
+if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then
 echo "lindera has been found in the sources and it shouldn't"
 exit 1
 fi
Cargo.lock (generated, 30 changes)

@@ -381,9 +381,9 @@ dependencies = [
 
 [[package]]
 name = "arroy"
-version = "0.4.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a"
+checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9"
 dependencies = [
  "bytemuck",
  "byteorder",

@@ -679,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 
 [[package]]
 name = "bytemuck"
-version = "1.16.1"
+version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
+checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
 dependencies = [
  "bytemuck_derive",
 ]

@@ -2273,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
 [[package]]
 name = "heed"
-version = "0.20.2"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8"
+checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd"
 dependencies = [
  "bitflags 2.5.0",
  "byteorder",

@@ -2455,7 +2455,6 @@ name = "index-scheduler"
 version = "1.9.0"
 dependencies = [
  "anyhow",
- "arroy",
  "big_s",
  "bincode",
  "crossbeam",

@@ -2466,7 +2465,6 @@ dependencies = [
  "file-store",
  "flate2",
  "insta",
- "maplit",
  "meili-snap",
  "meilisearch-auth",
  "meilisearch-types",

@@ -3172,9 +3170,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da"
 
 [[package]]
 name = "lmdb-master-sys"
-version = "0.2.1"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3"
+checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a"
 dependencies = [
  "cc",
  "doxygen-rs",

@@ -5053,18 +5051,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.61"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
+checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.61"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
+checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
 dependencies = [
  "proc-macro2",
  "quote",

@@ -5303,9 +5301,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-actix-web"
-version = "0.7.11"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19"
+checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684"
 dependencies = [
  "actix-web",
  "mutually_exclusive_features",
Dockerfile

@@ -17,7 +17,7 @@ RUN set -eux; \
 if [ "$apkArch" = "aarch64" ]; then \
 export JEMALLOC_SYS_WITH_LG_PAGE=16; \
 fi && \
-cargo build --release -p meilisearch -p meilitool --features "swedish-recomposition"
+cargo build --release -p meilisearch -p meilitool
 
 # Run
 FROM alpine:3.16
@@ -780,7 +780,7 @@ expression: document
 1.3484878540039063
 ]
 ],
-"regenerate": true
+"userProvided": false
 }
 }
 }

@@ -779,7 +779,7 @@ expression: document
 1.04031240940094
 ]
 ],
-"regenerate": true
+"userProvided": false
 }
 }
 }
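The two snapshot hunks above capture the `_vectors` payload difference between the compared branches: the `-` side serializes a per-embedder `regenerate` flag, the `+` side a `userProvided` flag. A minimal sketch of the two serialized shapes, assuming the serde and serde_json crates and hypothetical struct names (the real type is milli's `ExplicitVectors`, which nests the embeddings differently):

```rust
use serde::Serialize;

// Hypothetical stand-in for the shape on the "regenerate" side of the diff.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct RegenerateShape {
    embeddings: Vec<Vec<f64>>,
    regenerate: bool,
}

// Hypothetical stand-in for the shape on the "userProvided" side of the diff.
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct UserProvidedShape {
    embeddings: Vec<Vec<f64>>,
    user_provided: bool,
}

fn main() {
    let a = RegenerateShape { embeddings: vec![vec![1.25]], regenerate: true };
    let b = UserProvidedShape { embeddings: vec![vec![1.25]], user_provided: false };
    // Prints something like {"embeddings":[[1.25]],"regenerate":true}
    println!("{}", serde_json::to_string(&a).unwrap());
    // Prints something like {"embeddings":[[1.25]],"userProvided":false}
    println!("{}", serde_json::to_string(&b).unwrap());
}
```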
@@ -152,7 +152,6 @@ impl Settings<Unchecked> {
 }
 
 #[derive(Debug, Clone, Deserialize)]
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[cfg_attr(test, derive(serde::Serialize))]
 #[serde(deny_unknown_fields)]
 #[serde(rename_all = "camelCase")]

@@ -182,7 +182,6 @@ impl Settings<Unchecked> {
 }
 }
 
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[derive(Debug, Clone, Deserialize)]
 #[cfg_attr(test, derive(serde::Serialize))]
 #[serde(deny_unknown_fields)]

@@ -200,7 +200,6 @@ impl std::ops::Deref for IndexUid {
 }
 }
 
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[derive(Debug)]
 #[cfg_attr(test, derive(serde::Serialize))]
 #[cfg_attr(test, serde(rename_all = "camelCase"))]
Binary file not shown.
@@ -40,9 +40,7 @@ ureq = "2.9.7"
 uuid = { version = "1.6.1", features = ["serde", "v4"] }
 
 [dev-dependencies]
-arroy = "0.4.0"
 big_s = "1.0.2"
 crossbeam = "0.8.4"
 insta = { version = "1.34.0", features = ["json", "redactions"] }
-maplit = "1.0.2"
 meili-snap = { path = "../meili-snap" }
@@ -909,7 +909,6 @@ impl IndexScheduler {
 
 let fields_ids_map = index.fields_ids_map(&rtxn)?;
 let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
-let embedding_configs = index.embedding_configs(&rtxn)?;
 
 // 3.1. Dump the documents
 for ret in index.all_documents(&rtxn)? {

@@ -952,21 +951,16 @@ impl IndexScheduler {
 };
 
 for (embedder_name, embeddings) in embeddings {
-let user_provided = embedding_configs
-    .iter()
-    .find(|conf| conf.name == embedder_name)
-    .is_some_and(|conf| conf.user_provided.contains(id));
-
-let embeddings = ExplicitVectors {
-    embeddings: Some(
-        VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
-    ),
-    regenerate: !user_provided,
-};
-vectors.insert(
-    embedder_name,
-    serde_json::to_value(embeddings).unwrap(),
-);
+// don't change the entry if it already exists, because it was user-provided
+vectors.entry(embedder_name).or_insert_with(|| {
+    let embeddings = ExplicitVectors {
+        embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
+            embeddings,
+        ),
+        user_provided: false,
+    };
+    serde_json::to_value(embeddings).unwrap()
+});
 }
 }
 
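In the dump hunk above, the removed (`-`) code derives `regenerate` from the per-embedder `user_provided` bitmap, while the added (`+`) code always writes `user_provided: false` and leaves existing entries untouched. A rough sketch of the removed-side lookup, with a `HashSet<u32>` standing in for the `RoaringBitmap` and a trimmed-down `IndexEmbeddingConfig` (both stand-ins are assumptions, not the real milli types):

```rust
use std::collections::HashSet;

// Trimmed-down stand-in for milli's IndexEmbeddingConfig.
struct IndexEmbeddingConfig {
    name: String,
    user_provided: HashSet<u32>, // document ids whose vectors were user-provided
}

/// Returns true when the vectors for `doc_id` under `embedder_name` should be
/// regenerated on update, i.e. when they were *not* provided by the user.
fn should_regenerate(configs: &[IndexEmbeddingConfig], embedder_name: &str, doc_id: u32) -> bool {
    let user_provided = configs
        .iter()
        .find(|conf| conf.name == embedder_name)
        .is_some_and(|conf| conf.user_provided.contains(&doc_id));
    !user_provided
}

fn main() {
    let configs = vec![IndexEmbeddingConfig {
        name: "A_fakerest".into(),
        user_provided: HashSet::from([0]),
    }];
    assert!(!should_regenerate(&configs, "A_fakerest", 0)); // user-provided => keep as-is
    assert!(should_regenerate(&configs, "A_fakerest", 1)); // not user-provided => regenerate
}
```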
@@ -53,7 +53,6 @@ use meilisearch_types::heed::byteorder::BE;
 use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
 use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
 use meilisearch_types::milli::documents::DocumentsBatchBuilder;
-use meilisearch_types::milli::index::IndexEmbeddingConfig;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};

@@ -1460,39 +1459,33 @@ impl IndexScheduler {
 // TODO: consider using a type alias or a struct embedder/template
 pub fn embedders(
 &self,
-embedding_configs: Vec<IndexEmbeddingConfig>,
+embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
 ) -> Result<EmbeddingConfigs> {
 let res: Result<_> = embedding_configs
 .into_iter()
-.map(
-    |IndexEmbeddingConfig {
-        name,
-        config: milli::vector::EmbeddingConfig { embedder_options, prompt },
-        ..
-    }| {
-    let prompt =
-        Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
-    // optimistically return existing embedder
-    {
-        let embedders = self.embedders.read().unwrap();
-        if let Some(embedder) = embedders.get(&embedder_options) {
-            return Ok((name, (embedder.clone(), prompt)));
-        }
-    }
+.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
+    let prompt =
+        Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
+    // optimistically return existing embedder
+    {
+        let embedders = self.embedders.read().unwrap();
+        if let Some(embedder) = embedders.get(&embedder_options) {
+            return Ok((name, (embedder.clone(), prompt)));
+        }
+    }
 
 // add missing embedder
 let embedder = Arc::new(
 Embedder::new(embedder_options.clone())
 .map_err(meilisearch_types::milli::vector::Error::from)
 .map_err(meilisearch_types::milli::Error::from)?,
 );
 {
 let mut embedders = self.embedders.write().unwrap();
 embedders.insert(embedder_options, embedder.clone());
 }
 Ok((name, (embedder, prompt)))
-},
-)
+})
 .collect();
 res.map(EmbeddingConfigs::new)
 }
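The `embedders()` hunk above is mostly a signature change (the `-` side receives `IndexEmbeddingConfig` values, the `+` side `(String, EmbeddingConfig)` tuples); both sides keep the same caching pattern: try a shared read lock first and only take the write lock to insert a missing embedder. A stripped-down sketch of that pattern, using `String` keys and a trivial `Embedder` in place of the real milli types (assumptions for illustration only):

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Stand-in for the real embedder; constructing it is assumed to be expensive.
struct Embedder {
    options: String,
}

#[derive(Default)]
struct EmbedderCache {
    embedders: RwLock<HashMap<String, Arc<Embedder>>>,
}

impl EmbedderCache {
    fn get_or_create(&self, options: String) -> Arc<Embedder> {
        // Optimistically return an existing embedder under the read lock.
        {
            let embedders = self.embedders.read().unwrap();
            if let Some(embedder) = embedders.get(&options) {
                return embedder.clone();
            }
        }
        // Build the missing embedder, then insert it under the write lock.
        let embedder = Arc::new(Embedder { options: options.clone() });
        let mut embedders = self.embedders.write().unwrap();
        embedders.entry(options).or_insert(embedder).clone()
    }
}

fn main() {
    let cache = EmbedderCache::default();
    let a = cache.get_or_create("fakerest".to_string());
    let b = cache.get_or_create("fakerest".to_string());
    assert!(Arc::ptr_eq(&a, &b)); // the second call reuses the cached embedder
    let _ = &a.options;
}
```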
@@ -1755,9 +1748,6 @@ mod tests {
 use meilisearch_types::milli::update::IndexDocumentsMethod::{
 ReplaceDocuments, UpdateDocuments,
 };
-use meilisearch_types::milli::update::Setting;
-use meilisearch_types::milli::vector::settings::EmbeddingSettings;
-use meilisearch_types::settings::Unchecked;
 use meilisearch_types::tasks::IndexSwap;
 use meilisearch_types::VERSION_FILE_NAME;
 use tempfile::{NamedTempFile, TempDir};
@@ -1836,7 +1826,6 @@ mod tests {
 assert_eq!(breakpoint, (Init, false));
 let index_scheduler_handle = IndexSchedulerHandle {
 _tempdir: tempdir,
-index_scheduler: index_scheduler.private_clone(),
 test_breakpoint_rcv: receiver,
 last_breakpoint: breakpoint.0,
 };
@@ -1925,7 +1914,6 @@ mod tests {
 
 pub struct IndexSchedulerHandle {
 _tempdir: TempDir,
-index_scheduler: IndexScheduler,
 test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>,
 last_breakpoint: Breakpoint,
 }
@@ -1943,13 +1931,9 @@ mod tests {
 {
 Ok(b) => b,
 Err(RecvTimeoutError::Timeout) => {
-let state = snapshot_index_scheduler(&self.index_scheduler);
-panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
-}
-Err(RecvTimeoutError::Disconnected) => {
-let state = snapshot_index_scheduler(&self.index_scheduler);
-panic!("The scheduler crashed.\n{state}")
+panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.")
 }
+Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
 };
 // if we've already encountered a breakpoint we're supposed to be stuck on the false
 // and we expect the same variant with the true to come now.
@@ -1968,13 +1952,9 @@ mod tests {
 {
 Ok(b) => b,
 Err(RecvTimeoutError::Timeout) => {
-let state = snapshot_index_scheduler(&self.index_scheduler);
-panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
-}
-Err(RecvTimeoutError::Disconnected) => {
-let state = snapshot_index_scheduler(&self.index_scheduler);
-panic!("The scheduler crashed.\n{state}")
+panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.")
 }
+Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
 };
 assert!(!b, "Found the breakpoint handle in a bad state. Check your test suite");
 
@@ -1988,10 +1968,9 @@ mod tests {
 fn advance_till(&mut self, breakpoints: impl IntoIterator<Item = Breakpoint>) {
 for breakpoint in breakpoints {
 let b = self.advance();
-let state = snapshot_index_scheduler(&self.index_scheduler);
 assert_eq!(
 b, breakpoint,
-"Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}",
+"Was expecting the breakpoint `{:?}` but instead got `{:?}`.",
 breakpoint, b
 );
 }
@@ -2016,7 +1995,6 @@ mod tests {
 // Wait for one successful batch.
 #[track_caller]
 fn advance_one_successful_batch(&mut self) {
-self.index_scheduler.assert_internally_consistent();
 self.advance_till([Start, BatchCreated]);
 loop {
 match self.advance() {
@@ -2025,17 +2003,13 @@ mod tests {
 InsideProcessBatch => (),
 // the batch went successfully, we can stop the loop and go on with the next states.
 ProcessBatchSucceeded => break,
-AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
-ProcessBatchFailed => {
-while self.advance() != Start {}
-panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler))
-},
+AbortedIndexation => panic!("The batch was aborted."),
+ProcessBatchFailed => panic!("The batch failed."),
 breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
 }
 }
 
 self.advance_till([AfterProcessing]);
-self.index_scheduler.assert_internally_consistent();
 }
 
 // Wait for one failed batch.
@@ -2049,8 +2023,8 @@ mod tests {
 InsideProcessBatch => (),
 // the batch went failed, we can stop the loop and go on with the next states.
 ProcessBatchFailed => break,
-ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)),
-AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
+ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"),
+AbortedIndexation => panic!("The batch was aborted."),
 breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
 }
 }
@@ -3078,10 +3052,8 @@ mod tests {
 let rtxn = index.read_txn().unwrap();
 
 let configs = index.embedding_configs(&rtxn).unwrap();
-let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap();
-insta::assert_snapshot!(name, @"default");
-insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
-insta::assert_json_snapshot!(config.embedder_options);
+let (_, embedding_config) = configs.first().unwrap();
+insta::assert_json_snapshot!(embedding_config.embedder_options);
 }
 
 #[test]
@@ -5017,6 +4989,7 @@ mod tests {
 false,
 )
 .unwrap();
+index_scheduler.assert_internally_consistent();
 
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors");
 
@@ -5027,7 +5000,7 @@ mod tests {
 insta::assert_json_snapshot!(task.details);
 }
 
-handle.advance_one_successful_batch();
+handle.advance_n_successful_batches(1);
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors");
 
 {
@@ -5044,17 +5017,13 @@ mod tests {
 let configs = index.embedding_configs(&rtxn).unwrap();
 // for consistency with the below
 #[allow(clippy::get_first)]
-let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } =
-    configs.get(0).unwrap();
-insta::assert_snapshot!(name, @"A_fakerest");
-insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
+let (name, fakerest_config) = configs.get(0).unwrap();
+insta::assert_json_snapshot!(name, @r###""A_fakerest""###);
 insta::assert_json_snapshot!(fakerest_config.embedder_options);
 let fakerest_name = name.clone();
 
-let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } =
-    configs.get(1).unwrap();
-insta::assert_snapshot!(name, @"B_small_hf");
-insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
+let (name, simple_hf_config) = configs.get(1).unwrap();
+insta::assert_json_snapshot!(name, @r###""B_small_hf""###);
 insta::assert_json_snapshot!(simple_hf_config.embedder_options);
 let simple_hf_name = name.clone();
 
@@ -5069,25 +5038,25 @@ mod tests {
 // add one doc, specifying vectors
 
 let doc = serde_json::json!(
 {
 "id": 0,
 "doggo": "Intel",
 "breed": "beagle",
 "_vectors": {
 &fakerest_name: {
 // this will never trigger regeneration, which is good because we can't actually generate with
 // this embedder
-"regenerate": false,
+"userProvided": true,
 "embeddings": beagle_embed,
 },
 &simple_hf_name: {
 // this will be regenerated on updates
-"regenerate": true,
+"userProvided": false,
 "embeddings": lab_embed,
 },
 "noise": [0.1, 0.2, 0.3]
 }
 }
 );
 
 let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap();
@@ -5109,6 +5078,7 @@ mod tests {
 false,
 )
 .unwrap();
+index_scheduler.assert_internally_consistent();
 
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel");
 
@@ -5121,19 +5091,6 @@ mod tests {
 let index = index_scheduler.index("doggos").unwrap();
 let rtxn = index.read_txn().unwrap();
 
-// Ensure the document have been inserted into the relevant bitamp
-let configs = index.embedding_configs(&rtxn).unwrap();
-// for consistency with the below
-#[allow(clippy::get_first)]
-let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
-    configs.get(0).unwrap();
-insta::assert_snapshot!(name, @"A_fakerest");
-insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
-
-let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap();
-insta::assert_snapshot!(name, @"B_small_hf");
-insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
-
 let embeddings = index.embeddings(&rtxn, 0).unwrap();
 
 assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
@@ -5183,6 +5140,7 @@ mod tests {
 false,
 )
 .unwrap();
+index_scheduler.assert_internally_consistent();
 
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir");
 
@@ -5195,25 +5153,11 @@ mod tests {
 let index = index_scheduler.index("doggos").unwrap();
 let rtxn = index.read_txn().unwrap();
 
-// Ensure the document have been inserted into the relevant bitamp
-let configs = index.embedding_configs(&rtxn).unwrap();
-// for consistency with the below
-#[allow(clippy::get_first)]
-let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
-    configs.get(0).unwrap();
-insta::assert_snapshot!(name, @"A_fakerest");
-insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
-
-let IndexEmbeddingConfig { name, config: _, user_provided } =
-    configs.get(1).unwrap();
-insta::assert_snapshot!(name, @"B_small_hf");
-insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
-
 let embeddings = index.embeddings(&rtxn, 0).unwrap();
 
-// automatically changed to patou because set to regenerate
+// automatically changed to patou
 assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true");
-// remained beagle
+// remained beagle because set to userProvided
 assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true");
 
 let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
@@ -5232,578 +5176,4 @@ mod tests {
 }
 }
 }
|
|
||||||
#[test]
|
|
||||||
fn import_vectors_first_and_embedder_later() {
|
|
||||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
|
||||||
|
|
||||||
let content = serde_json::json!(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"doggo": "kefir",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"doggo": "intel",
|
|
||||||
"_vectors": {
|
|
||||||
"my_doggo_embedder": vec![1; 384],
|
|
||||||
"unknown embedder": vec![1, 2, 3],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"doggo": "max",
|
|
||||||
"_vectors": {
|
|
||||||
"my_doggo_embedder": {
|
|
||||||
"regenerate": false,
|
|
||||||
"embeddings": vec![2; 384],
|
|
||||||
},
|
|
||||||
"unknown embedder": vec![4, 5],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"doggo": "marcel",
|
|
||||||
"_vectors": {
|
|
||||||
"my_doggo_embedder": {
|
|
||||||
"regenerate": true,
|
|
||||||
"embeddings": vec![3; 384],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"doggo": "sora",
|
|
||||||
"_vectors": {
|
|
||||||
"my_doggo_embedder": {
|
|
||||||
"regenerate": true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
|
|
||||||
let documents_count =
|
|
||||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
|
||||||
.unwrap();
|
|
||||||
snapshot!(documents_count, @"5");
|
|
||||||
file.persist().unwrap();
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::DocumentAdditionOrUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
primary_key: None,
|
|
||||||
method: ReplaceDocuments,
|
|
||||||
content_file: uuid,
|
|
||||||
documents_count,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
|
|
||||||
|
|
||||||
let setting = meilisearch_types::settings::Settings::<Unchecked> {
|
|
||||||
embedders: Setting::Set(maplit::btreemap! {
|
|
||||||
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
|
|
||||||
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
|
|
||||||
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
|
|
||||||
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
|
|
||||||
document_template: Setting::Set(S("{{doc.doggo}}")),
|
|
||||||
..Default::default()
|
|
||||||
})
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::SettingsUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
new_settings: Box::new(setting),
|
|
||||||
is_deletion: false,
|
|
||||||
allow_index_creation: false,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
index_scheduler.assert_internally_consistent();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
index_scheduler.assert_internally_consistent();
|
|
||||||
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
// the all the vectors linked to the new specified embedder have been removed
|
|
||||||
// Only the unknown embedders stays in the document DB
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);
|
|
||||||
let conf = index.embedding_configs(&rtxn).unwrap();
|
|
||||||
// even though we specified the vector for the ID 3, it shouldn't be marked
|
|
||||||
// as user provided since we explicitely marked it as NOT user provided.
|
|
||||||
snapshot!(format!("{conf:#?}"), @r###"
|
|
||||||
[
|
|
||||||
IndexEmbeddingConfig {
|
|
||||||
name: "my_doggo_embedder",
|
|
||||||
config: EmbeddingConfig {
|
|
||||||
embedder_options: HuggingFace(
|
|
||||||
EmbedderOptions {
|
|
||||||
model: "sentence-transformers/all-MiniLM-L6-v2",
|
|
||||||
revision: Some(
|
|
||||||
"e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
|
|
||||||
),
|
|
||||||
distribution: None,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
prompt: PromptData {
|
|
||||||
template: "{{doc.doggo}}",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
user_provided: RoaringBitmap<[1, 2]>,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
|
||||||
let embedding = &embeddings["my_doggo_embedder"];
|
|
||||||
assert!(!embedding.is_empty(), "{embedding:?}");
|
|
||||||
|
|
||||||
// the document with the id 3 should keep its original embedding
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
|
||||||
let mut embeddings = Vec::new();
|
|
||||||
|
|
||||||
'vectors: for i in 0..=u8::MAX {
|
|
||||||
let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy)
|
|
||||||
.map(Some)
|
|
||||||
.or_else(|e| match e {
|
|
||||||
arroy::Error::MissingMetadata(_) => Ok(None),
|
|
||||||
e => Err(e),
|
|
||||||
})
|
|
||||||
.transpose();
|
|
||||||
|
|
||||||
let Some(reader) = reader else {
|
|
||||||
break 'vectors;
|
|
||||||
};
|
|
||||||
|
|
||||||
let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
|
|
||||||
if let Some(embedding) = embedding {
|
|
||||||
embeddings.push(embedding)
|
|
||||||
} else {
|
|
||||||
break 'vectors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
snapshot!(embeddings.len(), @"1");
|
|
||||||
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
|
|
||||||
|
|
||||||
// If we update marcel it should regenerate its embedding automatically
|
|
||||||
|
|
||||||
let content = serde_json::json!(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"doggo": "marvel",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"doggo": "sorry",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap();
|
|
||||||
let documents_count =
|
|
||||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
|
||||||
.unwrap();
|
|
||||||
snapshot!(documents_count, @"2");
|
|
||||||
file.persist().unwrap();
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::DocumentAdditionOrUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
primary_key: None,
|
|
||||||
method: UpdateDocuments,
|
|
||||||
content_file: uuid,
|
|
||||||
documents_count,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
// the document with the id 3 should have its original embedding updated
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
|
||||||
let doc = index.documents(&rtxn, Some(docid)).unwrap()[0];
|
|
||||||
let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap();
|
|
||||||
snapshot!(json_string!(doc), @r###"
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"doggo": "marvel"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
|
||||||
let embedding = &embeddings["my_doggo_embedder"];
|
|
||||||
|
|
||||||
assert!(!embedding.is_empty());
|
|
||||||
assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
|
|
||||||
|
|
||||||
// the document with the id 4 should generate an embedding
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
|
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
|
||||||
let embedding = &embeddings["my_doggo_embedder"];
|
|
||||||
|
|
||||||
assert!(!embedding.is_empty());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn delete_document_containing_vector() {
|
|
||||||
// 1. Add an embedder
|
|
||||||
// 2. Push two documents containing a simple vector
|
|
||||||
// 3. Delete the first document
|
|
||||||
// 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore
|
|
||||||
// 5. Clear the index
|
|
||||||
// 6. The user defined roaring bitmap shouldn't contains the id of the second document
|
|
||||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
|
||||||
|
|
||||||
let setting = meilisearch_types::settings::Settings::<Unchecked> {
|
|
||||||
embedders: Setting::Set(maplit::btreemap! {
|
|
||||||
S("manual") => Setting::Set(EmbeddingSettings {
|
|
||||||
source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
|
|
||||||
dimensions: Setting::Set(3),
|
|
||||||
..Default::default()
|
|
||||||
})
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::SettingsUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
new_settings: Box::new(setting),
|
|
||||||
is_deletion: false,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
let content = serde_json::json!(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"doggo": "kefir",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": vec![0, 0, 0],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"doggo": "intel",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": vec![1, 1, 1],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
|
|
||||||
let documents_count =
|
|
||||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
|
||||||
.unwrap();
|
|
||||||
snapshot!(documents_count, @"2");
|
|
||||||
file.persist().unwrap();
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::DocumentAdditionOrUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
primary_key: None,
|
|
||||||
method: ReplaceDocuments,
|
|
||||||
content_file: uuid,
|
|
||||||
documents_count,
|
|
||||||
allow_index_creation: false,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::DocumentDeletion {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
documents_ids: vec![S("1")],
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###);
|
|
||||||
let conf = index.embedding_configs(&rtxn).unwrap();
|
|
||||||
snapshot!(format!("{conf:#?}"), @r###"
|
|
||||||
[
|
|
||||||
IndexEmbeddingConfig {
|
|
||||||
name: "manual",
|
|
||||||
config: EmbeddingConfig {
|
|
||||||
embedder_options: UserProvided(
|
|
||||||
EmbedderOptions {
|
|
||||||
dimensions: 3,
|
|
||||||
distribution: None,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
prompt: PromptData {
|
|
||||||
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
user_provided: RoaringBitmap<[0]>,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
|
||||||
let embedding = &embeddings["manual"];
|
|
||||||
assert!(!embedding.is_empty(), "{embedding:?}");
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), @"[]");
|
|
||||||
let conf = index.embedding_configs(&rtxn).unwrap();
|
|
||||||
snapshot!(format!("{conf:#?}"), @r###"
|
|
||||||
[
|
|
||||||
IndexEmbeddingConfig {
|
|
||||||
name: "manual",
|
|
||||||
config: EmbeddingConfig {
|
|
||||||
embedder_options: UserProvided(
|
|
||||||
EmbedderOptions {
|
|
||||||
dimensions: 3,
|
|
||||||
distribution: None,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
prompt: PromptData {
|
|
||||||
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
user_provided: RoaringBitmap<[]>,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn delete_embedder_with_user_provided_vectors() {
|
|
||||||
// 1. Add two embedders
|
|
||||||
// 2. Push two documents containing a simple vector
|
|
||||||
// 3. The documents must not contain the vectors after the update as they are in the vectors db
|
|
||||||
// 3. Delete the embedders
|
|
||||||
// 4. The documents contain the vectors again
|
|
||||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
|
||||||
|
|
||||||
let setting = meilisearch_types::settings::Settings::<Unchecked> {
|
|
||||||
embedders: Setting::Set(maplit::btreemap! {
|
|
||||||
S("manual") => Setting::Set(EmbeddingSettings {
|
|
||||||
source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
|
|
||||||
dimensions: Setting::Set(3),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
|
|
||||||
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
|
|
||||||
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
|
|
||||||
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
|
|
||||||
document_template: Setting::Set(S("{{doc.doggo}}")),
|
|
||||||
..Default::default()
|
|
||||||
}),
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::SettingsUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
new_settings: Box::new(setting),
|
|
||||||
is_deletion: false,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
let content = serde_json::json!(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"doggo": "kefir",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": vec![0, 0, 0],
|
|
||||||
"my_doggo_embedder": vec![1; 384],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"doggo": "intel",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": vec![1, 1, 1],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
|
|
||||||
let documents_count =
|
|
||||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
|
||||||
.unwrap();
|
|
||||||
snapshot!(documents_count, @"2");
|
|
||||||
file.persist().unwrap();
|
|
||||||
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::DocumentAdditionOrUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
primary_key: None,
|
|
||||||
method: ReplaceDocuments,
|
|
||||||
content_file: uuid,
|
|
||||||
documents_count,
|
|
||||||
allow_index_creation: false,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
|
|
||||||
{
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let setting = meilisearch_types::settings::Settings::<Unchecked> {
|
|
||||||
embedders: Setting::Set(maplit::btreemap! {
|
|
||||||
S("manual") => Setting::Reset,
|
|
||||||
}),
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::SettingsUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
new_settings: Box::new(setting),
|
|
||||||
is_deletion: false,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let setting = meilisearch_types::settings::Settings::<Unchecked> {
|
|
||||||
embedders: Setting::Reset,
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
index_scheduler
|
|
||||||
.register(
|
|
||||||
KindWithContent::SettingsUpdate {
|
|
||||||
index_uid: S("doggos"),
|
|
||||||
new_settings: Box::new(setting),
|
|
||||||
is_deletion: false,
|
|
||||||
allow_index_creation: true,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
handle.advance_one_successful_batch();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let index = index_scheduler.index("doggos").unwrap();
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
|
||||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
|
||||||
let documents = index
|
|
||||||
.all_documents(&rtxn)
|
|
||||||
.unwrap()
|
|
||||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
// FIXME: redaction
|
|
||||||
snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,10 @@ expression: doc
 "doggo": "kefir",
 "breed": "patou",
 "_vectors": {
+"A_fakerest": {
+"embeddings": "[vector]",
+"userProvided": true
+},
 "noise": [
 0.1,
 0.2,
@@ -6,6 +6,10 @@ expression: doc
 "doggo": "Intel",
 "breed": "beagle",
 "_vectors": {
+"A_fakerest": {
+"embeddings": "[vector]",
+"userProvided": true
+},
 "noise": [
 0.1,
 0.2,
File diff suppressed because one or more lines are too long
@@ -222,7 +222,6 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
 InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
 InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
-InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;

@@ -241,11 +240,9 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
-InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
 InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
-InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
 InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
 InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;

@@ -273,14 +270,13 @@ InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ;
 InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
-InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;

@@ -385,7 +381,6 @@ impl ErrorCode for milli::Error {
 Code::IndexPrimaryKeyMultipleCandidatesFound
 }
 UserError::PrimaryKeyCannotBeChanged(_) => Code::IndexPrimaryKeyAlreadyExists,
-UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct,
 UserError::SortRankingRuleMissing => Code::InvalidSearchSort,
 UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets,
 UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,

@@ -398,8 +393,7 @@ impl ErrorCode for milli::Error {
 UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
 UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
 UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
-UserError::InvalidVectorsMapType { .. }
-| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
+UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
 UserError::TooManyVectors(_, _) => Code::TooManyVectors,
 UserError::SortError(_) => Code::InvalidSearchSort,
 UserError::InvalidMinTypoWordLenSetting(_, _) => {
@@ -8,7 +8,6 @@ use std::str::FromStr;
 
 use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
 use fst::IntoStreamer;
-use milli::index::IndexEmbeddingConfig;
 use milli::proximity::ProximityPrecision;
 use milli::update::Setting;
 use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
@@ -673,7 +672,7 @@ pub fn settings(
 let embedders: BTreeMap<_, _> = index
 .embedding_configs(rtxn)?
 .into_iter()
-.map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into())))
+.map(|(name, config)| (name, Setting::Set(config.into())))
 .collect();
 let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
 
@@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"]
 swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
 
 [package.metadata.mini-dashboard]
-assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
-sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"
+assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
+sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"
@@ -74,8 +74,8 @@ pub enum DocumentDeletionKind {
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum DocumentFetchKind {
-PerDocumentId { retrieve_vectors: bool },
-Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
+PerDocumentId,
+Normal { with_filter: bool, limit: usize, offset: usize },
 }
 
 pub trait Analytics: Sync + Send {
@@ -597,9 +597,6 @@ pub struct SearchAggregator {
 // every time a request has a filter, this field must be incremented by one
 sort_total_number_of_criteria: usize,
 
-// distinct
-distinct: bool,
-
 // filter
 filter_with_geo_radius: bool,
 filter_with_geo_bounding_box: bool,
@@ -625,7 +622,6 @@ pub struct SearchAggregator {
 // Whether a non-default embedder was specified
 embedder: bool,
 hybrid: bool,
-retrieve_vectors: bool,
 
 // every time a search is done, we increment the counter linked to the used settings
 matching_strategy: HashMap<String, usize>,
@@ -666,7 +662,6 @@ impl SearchAggregator {
 page,
 hits_per_page,
 attributes_to_retrieve: _,
-retrieve_vectors,
 attributes_to_crop: _,
 crop_length,
 attributes_to_highlight: _,
@@ -675,7 +670,6 @@ impl SearchAggregator {
 show_ranking_score_details,
 filter,
 sort,
-distinct,
 facets: _,
 highlight_pre_tag,
 highlight_post_tag,
@@ -698,8 +692,6 @@ impl SearchAggregator {
 ret.sort_sum_of_criteria_terms = sort.len();
 }
 
-ret.distinct = distinct.is_some();
-
 if let Some(ref filter) = filter {
 static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
 ret.filter_total_number_of_criteria = 1;
@@ -736,7 +728,6 @@ impl SearchAggregator {
 if let Some(ref vector) = vector {
 ret.max_vector_size = vector.len();
 }
-ret.retrieve_vectors |= retrieve_vectors;
 
 if query.is_finite_pagination() {
 let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
@@ -804,7 +795,6 @@ impl SearchAggregator {
 sort_with_geo_point,
 sort_sum_of_criteria_terms,
 sort_total_number_of_criteria,
-distinct,
 filter_with_geo_radius,
 filter_with_geo_bounding_box,
 filter_sum_of_criteria_terms,
@@ -813,7 +803,6 @@ impl SearchAggregator {
 attributes_to_search_on_total_number_of_uses,
 max_terms_number,
 max_vector_size,
-retrieve_vectors,
 matching_strategy,
 max_limit,
 max_offset,
@@ -862,9 +851,6 @@ impl SearchAggregator {
 self.sort_total_number_of_criteria =
 self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
 
-// distinct
-self.distinct |= distinct;
-
 // filter
 self.filter_with_geo_radius |= filter_with_geo_radius;
 self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
@@ -887,7 +873,6 @@ impl SearchAggregator {
 
 // vector
 self.max_vector_size = self.max_vector_size.max(max_vector_size);
-self.retrieve_vectors |= retrieve_vectors;
 self.semantic_ratio |= semantic_ratio;
 self.hybrid |= hybrid;
 self.embedder |= embedder;
@@ -936,7 +921,6 @@ impl SearchAggregator {
 sort_with_geo_point,
 sort_sum_of_criteria_terms,
 sort_total_number_of_criteria,
-distinct,
 filter_with_geo_radius,
 filter_with_geo_bounding_box,
 filter_sum_of_criteria_terms,
@@ -945,7 +929,6 @@ impl SearchAggregator {
 attributes_to_search_on_total_number_of_uses,
 max_terms_number,
 max_vector_size,
-retrieve_vectors,
 matching_strategy,
 max_limit,
 max_offset,
@@ -994,7 +977,6 @@ impl SearchAggregator {
 "with_geoPoint": sort_with_geo_point,
 "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
 },
-"distinct": distinct,
 "filter": {
 "with_geoRadius": filter_with_geo_radius,
 "with_geoBoundingBox": filter_with_geo_bounding_box,
@@ -1009,7 +991,6 @@ impl SearchAggregator {
 },
 "vector": {
 "max_vector_size": max_vector_size,
-"retrieve_vectors": retrieve_vectors,
 },
 "hybrid": {
 "enabled": hybrid,
@@ -1098,7 +1079,6 @@ impl MultiSearchAggregator {
 page: _,
 hits_per_page: _,
 attributes_to_retrieve: _,
-retrieve_vectors: _,
 attributes_to_crop: _,
 crop_length: _,
 attributes_to_highlight: _,
@@ -1107,7 +1087,6 @@ impl MultiSearchAggregator {
 show_matches_position: _,
 filter: _,
 sort: _,
-distinct: _,
 facets: _,
 highlight_pre_tag: _,
 highlight_post_tag: _,
@@ -1555,9 +1534,6 @@ pub struct DocumentsFetchAggregator {
 // if a filter was used
 per_filter: bool,
 
-#[serde(rename = "vector.retrieve_vectors")]
-retrieve_vectors: bool,
-
 // pagination
 #[serde(rename = "pagination.max_limit")]
 max_limit: usize,
@@ -1567,21 +1543,18 @@ pub struct DocumentsFetchAggregator {
 
 impl DocumentsFetchAggregator {
 pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
-let (limit, offset, retrieve_vectors) = match query {
-DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
-DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
-(*limit, *offset, *retrieve_vectors)
-}
+let (limit, offset) = match query {
+DocumentFetchKind::PerDocumentId => (1, 0),
+DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset),
 };
 Self {
 timestamp: Some(OffsetDateTime::now_utc()),
 user_agents: extract_user_agents(request).into_iter().collect(),
 total_received: 1,
-per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
+per_document_id: matches!(query, DocumentFetchKind::PerDocumentId),
 per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
 max_limit: limit,
 max_offset: offset,
-retrieve_vectors,
 }
 }
 
@@ -1595,7 +1568,6 @@ impl DocumentsFetchAggregator {
 per_filter,
 max_limit,
 max_offset,
-retrieve_vectors,
 } = other;
 
 if self.timestamp.is_none() {
@@ -1611,8 +1583,6 @@ impl DocumentsFetchAggregator {
 
 self.max_limit = self.max_limit.max(max_limit);
 self.max_offset = self.max_offset.max(max_offset);
-
-self.retrieve_vectors |= retrieve_vectors;
 }
 
 pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@@ -1653,7 +1623,6 @@ pub struct SimilarAggregator {
 
 // Whether a non-default embedder was specified
 embedder: bool,
-retrieve_vectors: bool,
 
 // pagination
 max_limit: usize,
@@ -1677,7 +1646,6 @@ impl SimilarAggregator {
 offset,
 limit,
 attributes_to_retrieve: _,
-retrieve_vectors,
 show_ranking_score,
 show_ranking_score_details,
 filter,
@@ -1722,7 +1690,6 @@ impl SimilarAggregator {
 ret.ranking_score_threshold = ranking_score_threshold.is_some();
 
 ret.embedder = embedder.is_some();
-ret.retrieve_vectors = *retrieve_vectors;
 
 ret
 }
@@ -1755,7 +1722,6 @@ impl SimilarAggregator {
 show_ranking_score_details,
 embedder,
 ranking_score_threshold,
-retrieve_vectors,
 } = other;
 
 if self.timestamp.is_none() {
@@ -1785,7 +1751,6 @@ impl SimilarAggregator {
 }
 
 self.embedder |= embedder;
-self.retrieve_vectors |= retrieve_vectors;
 
 // pagination
 self.max_limit = self.max_limit.max(max_limit);
@@ -1820,7 +1785,6 @@ impl SimilarAggregator {
 show_ranking_score_details,
 embedder,
 ranking_score_threshold,
-retrieve_vectors,
 } = self;
 
 if total_received == 0 {
@@ -1847,9 +1811,6 @@ impl SimilarAggregator {
 "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
 "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
 },
-"vector": {
-"retrieve_vectors": retrieve_vectors,
-},
 "hybrid": {
 "embedder": embedder,
 },
@@ -16,7 +16,6 @@ use meilisearch_types::error::{Code, ResponseError};
 use meilisearch_types::heed::RoTxn;
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::milli::update::IndexDocumentsMethod;
-use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
 use meilisearch_types::milli::DocumentId;
 use meilisearch_types::star_or::OptionStarOrList;
 use meilisearch_types::tasks::KindWithContent;
@@ -40,7 +39,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
 use crate::routes::{
 get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
 };
-use crate::search::{parse_filter, RetrieveVectors};
+use crate::search::parse_filter;
 use crate::Opt;
 
 static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@@ -95,8 +94,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
 pub struct GetDocument {
 #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
 fields: OptionStarOrList<String>,
-#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
-retrieve_vectors: Param<bool>,
 }
 
 pub async fn get_document(
@@ -110,20 +107,13 @@ pub async fn get_document(
 debug!(parameters = ?params, "Get document");
 let index_uid = IndexUid::try_from(index_uid)?;
 
-let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner();
+analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req);
 
+let GetDocument { fields } = params.into_inner();
 let attributes_to_retrieve = fields.merge_star_and_none();
 
-let features = index_scheduler.features();
-let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
-
-analytics.get_fetch_documents(
-&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
-&req,
-);
-
 let index = index_scheduler.index(&index_uid)?;
-let document =
-retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?;
+let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?;
 debug!(returns = ?document, "Get document");
 Ok(HttpResponse::Ok().json(document))
 }
@@ -163,8 +153,6 @@ pub struct BrowseQueryGet {
 limit: Param<usize>,
 #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
 fields: OptionStarOrList<String>,
-#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
-retrieve_vectors: Param<bool>,
 #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
 filter: Option<String>,
 }
@@ -178,8 +166,6 @@ pub struct BrowseQuery {
 limit: usize,
 #[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
 fields: Option<Vec<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
-retrieve_vectors: bool,
 #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
 filter: Option<Value>,
 }
@@ -199,7 +185,6 @@ pub async fn documents_by_query_post(
 with_filter: body.filter.is_some(),
 limit: body.limit,
 offset: body.offset,
-retrieve_vectors: body.retrieve_vectors,
 },
 &req,
 );
@@ -216,7 +201,7 @@ pub async fn get_documents(
 ) -> Result<HttpResponse, ResponseError> {
 debug!(parameters = ?params, "Get documents GET");
 
-let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
+let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner();
 
 let filter = match filter {
 Some(f) => match serde_json::from_str(&f) {
@@ -230,7 +215,6 @@ pub async fn get_documents(
 offset: offset.0,
 limit: limit.0,
 fields: fields.merge_star_and_none(),
-retrieve_vectors: retrieve_vectors.0,
 filter,
 };
 
@@ -239,7 +223,6 @@ pub async fn get_documents(
 with_filter: query.filter.is_some(),
 limit: query.limit,
 offset: query.offset,
-retrieve_vectors: query.retrieve_vectors,
 },
 &req,
 );
@@ -253,14 +236,10 @@ fn documents_by_query(
 query: BrowseQuery,
 ) -> Result<HttpResponse, ResponseError> {
 let index_uid = IndexUid::try_from(index_uid.into_inner())?;
-let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
+let BrowseQuery { offset, limit, fields, filter } = query;
 
-let features = index_scheduler.features();
-let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?;
-
 let index = index_scheduler.index(&index_uid)?;
-let (total, documents) =
-retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?;
+let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?;
 
 let ret = PaginationView::new(offset, limit, total as usize, documents);
 
@@ -600,44 +579,13 @@ fn some_documents<'a, 't: 'a>(
 index: &'a Index,
 rtxn: &'t RoTxn,
 doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
-retrieve_vectors: RetrieveVectors,
 ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
 let fields_ids_map = index.fields_ids_map(rtxn)?;
 let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
-let embedding_configs = index.embedding_configs(rtxn)?;
 
 Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
-ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
-let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
-match retrieve_vectors {
-RetrieveVectors::Ignore => {}
-RetrieveVectors::Hide => {
-document.remove("_vectors");
-}
-RetrieveVectors::Retrieve => {
-let mut vectors = match document.remove("_vectors") {
-Some(Value::Object(map)) => map,
-_ => Default::default(),
-};
-for (name, vector) in index.embeddings(rtxn, key)? {
-let user_provided = embedding_configs
-.iter()
-.find(|conf| conf.name == name)
-.is_some_and(|conf| conf.user_provided.contains(key));
-let embeddings = ExplicitVectors {
-embeddings: Some(vector.into()),
-regenerate: !user_provided,
-};
-vectors.insert(
-name,
-serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
-);
-}
-document.insert("_vectors".into(), vectors.into());
-}
-}
-
-Ok(document)
+ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> {
+Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?)
 })
 }))
 }
@@ -648,7 +596,6 @@ fn retrieve_documents<S: AsRef<str>>(
 limit: usize,
 filter: Option<Value>,
 attributes_to_retrieve: Option<Vec<S>>,
-retrieve_vectors: RetrieveVectors,
 ) -> Result<(u64, Vec<Document>), ResponseError> {
 let rtxn = index.read_txn()?;
 let filter = &filter;
@@ -673,57 +620,53 @@ fn retrieve_documents<S: AsRef<str>>(
 let (it, number_of_documents) = {
 let number_of_documents = candidates.len();
 (
-some_documents(
-index,
-&rtxn,
-candidates.into_iter().skip(offset).take(limit),
-retrieve_vectors,
-)?,
+some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?,
 number_of_documents,
 )
 };
 
-let documents: Vec<_> = it
+let documents: Result<Vec<_>, ResponseError> = it
 .map(|document| {
 Ok(match &attributes_to_retrieve {
 Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
 &document?,
-attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
-(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
-),
+attributes_to_retrieve.iter().map(|s| s.as_ref()),
 ),
 None => document?,
 })
 })
-.collect::<Result<_, ResponseError>>()?;
+.collect();
 
-Ok((number_of_documents, documents))
+Ok((number_of_documents, documents?))
 }
 
 fn retrieve_document<S: AsRef<str>>(
 index: &Index,
 doc_id: &str,
 attributes_to_retrieve: Option<Vec<S>>,
-retrieve_vectors: RetrieveVectors,
 ) -> Result<Document, ResponseError> {
 let txn = index.read_txn()?;
 
+let fields_ids_map = index.fields_ids_map(&txn)?;
+let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+
 let internal_id = index
 .external_documents_ids()
 .get(&txn, doc_id)?
 .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
 
-let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)?
+let document = index
+.documents(&txn, std::iter::once(internal_id))?
+.into_iter()
 .next()
-.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??;
+.map(|(_, d)| d)
+.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
 
+let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
 let document = match &attributes_to_retrieve {
 Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
 &document,
-attributes_to_retrieve
-.iter()
-.map(|s| s.as_ref())
-.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
+attributes_to_retrieve.iter().map(|s| s.as_ref()),
 ),
 None => document,
 };
@@ -115,7 +115,6 @@ impl From<FacetSearchQuery> for SearchQuery {
 page: None,
 hits_per_page: None,
 attributes_to_retrieve: None,
-retrieve_vectors: false,
 attributes_to_crop: None,
 crop_length: DEFAULT_CROP_LENGTH(),
 attributes_to_highlight: None,
@@ -124,7 +123,6 @@ impl From<FacetSearchQuery> for SearchQuery {
 show_ranking_score_details: false,
 filter,
 sort: None,
-distinct: None,
 facets: None,
 highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(),
 highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(),
@@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler;
 use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
 use crate::search::{
 add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
-RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
-DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
-DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
+SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
+DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
+DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
 };
 use crate::search_queue::SearchQueue;
 
@@ -51,8 +51,6 @@ pub struct SearchQueryGet {
 hits_per_page: Option<Param<usize>>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)]
 attributes_to_retrieve: Option<CS<String>>,
-#[deserr(default, error = DeserrQueryParamError<InvalidSearchRetrieveVectors>)]
-retrieve_vectors: Param<bool>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)]
 attributes_to_crop: Option<CS<String>>,
 #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)]
@@ -63,8 +61,6 @@ pub struct SearchQueryGet {
 filter: Option<String>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSearchSort>)]
 sort: Option<String>,
-#[deserr(default, error = DeserrQueryParamError<InvalidSearchDistinct>)]
-distinct: Option<String>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSearchShowMatchesPosition>)]
 show_matches_position: Param<bool>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSearchShowRankingScore>)]
@@ -157,13 +153,11 @@ impl From<SearchQueryGet> for SearchQuery {
 page: other.page.as_deref().copied(),
 hits_per_page: other.hits_per_page.as_deref().copied(),
 attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
-retrieve_vectors: other.retrieve_vectors.0,
 attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
 crop_length: other.crop_length.0,
 attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
 filter,
 sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)),
-distinct: other.distinct,
 show_matches_position: other.show_matches_position.0,
 show_ranking_score: other.show_ranking_score.0,
 show_ranking_score_details: other.show_ranking_score_details.0,
@@ -228,12 +222,10 @@ pub async fn search_with_url_query(
 let features = index_scheduler.features();
 
 let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
-let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
+
 let _permit = search_queue.try_get_search_permit().await?;
-let search_result = tokio::task::spawn_blocking(move || {
-perform_search(&index, query, search_kind, retrieve_vector)
-})
-.await?;
+let search_result =
+tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
 if let Ok(ref search_result) = search_result {
 aggregate.succeed(search_result);
 }
@@ -270,13 +262,10 @@ pub async fn search_with_post(
 let features = index_scheduler.features();
 
 let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
-let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
 
 let _permit = search_queue.try_get_search_permit().await?;
-let search_result = tokio::task::spawn_blocking(move || {
-perform_search(&index, query, search_kind, retrieve_vectors)
-})
-.await?;
+let search_result =
+tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
 if let Ok(ref search_result) = search_result {
 aggregate.succeed(search_result);
 if search_result.degraded {
@@ -298,10 +287,11 @@ pub fn search_kind(
 features: RoFeatures,
 ) -> Result<SearchKind, ResponseError> {
 if query.vector.is_some() {
-features.check_vector("Passing `vector` as a parameter")?;
+features.check_vector("Passing `vector` as a query parameter")?;
 }
+
 if query.hybrid.is_some() {
-features.check_vector("Passing `hybrid` as a parameter")?;
+features.check_vector("Passing `hybrid` as a query parameter")?;
 }
 
 // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing
@@ -4,7 +4,11 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter};
 use index_scheduler::IndexScheduler;
 use meilisearch_types::deserr::query_params::Param;
 use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
-use meilisearch_types::error::deserr_codes::*;
+use meilisearch_types::error::deserr_codes::{
+InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId,
+InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold,
+InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails,
+};
 use meilisearch_types::error::{ErrorCode as _, ResponseError};
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::keys::actions;
@@ -17,8 +21,8 @@ use crate::analytics::{Analytics, SimilarAggregator};
 use crate::extractors::authentication::GuardedData;
 use crate::extractors::sequential_extractor::SeqHandler;
 use crate::search::{
-add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
-SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
+add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery,
+SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
 };
 
 pub fn configure(cfg: &mut web::ServiceConfig) {
@@ -93,8 +97,6 @@ async fn similar(
 
 features.check_vector("Using the similar API")?;
 
-let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
-
 // Tenant token search_rules.
 if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
 add_search_rules(&mut query.filter, search_rules);
@@ -105,10 +107,8 @@ async fn similar(
 let (embedder_name, embedder) =
 SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?;
 
-tokio::task::spawn_blocking(move || {
-perform_similar(&index, query, embedder_name, embedder, retrieve_vectors)
-})
-.await?
+tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder))
+.await?
 }
 
 #[derive(Debug, deserr::Deserr)]
@@ -122,8 +122,6 @@ pub struct SimilarQueryGet {
 limit: Param<usize>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)]
 attributes_to_retrieve: Option<CS<String>>,
-#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRetrieveVectors>)]
-retrieve_vectors: Param<bool>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)]
 filter: Option<String>,
 #[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)]
@@ -158,7 +156,6 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
 offset,
 limit,
 attributes_to_retrieve,
-retrieve_vectors,
 filter,
 show_ranking_score,
 show_ranking_score_details,
@@ -183,7 +180,6 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
 filter,
 embedder,
 attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()),
-retrieve_vectors: retrieve_vectors.0,
 show_ranking_score: show_ranking_score.0,
 show_ranking_score_details: show_ranking_score_details.0,
 ranking_score_threshold: ranking_score_threshold.map(|x| x.0),
@@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
 use crate::extractors::sequential_extractor::SeqHandler;
 use crate::routes::indexes::search::search_kind;
 use crate::search::{
-add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex,
+add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
 };
 use crate::search_queue::SearchQueue;
 
@@ -83,14 +83,11 @@ pub async fn multi_search_with_post(
 
 let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)
 .with_index(query_index)?;
-let retrieve_vector =
-RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?;
 
-let search_result = tokio::task::spawn_blocking(move || {
-perform_search(&index, query, search_kind, retrieve_vector)
-})
-.await
-.with_index(query_index)?;
+let search_result =
+tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind))
+.await
+.with_index(query_index)?;
 
 search_results.push(SearchResultWithIndex {
 index_uid: index_uid.into_inner(),
@@ -15,7 +15,6 @@ use meilisearch_types::error::{Code, ResponseError};
 use meilisearch_types::heed::RoTxn;
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
-use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
 use meilisearch_types::milli::vector::Embedder;
 use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
 use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
@@ -60,8 +59,6 @@ pub struct SearchQuery {
 pub hits_per_page: Option<usize>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
 pub attributes_to_retrieve: Option<BTreeSet<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
-pub retrieve_vectors: bool,
 #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
 pub attributes_to_crop: Option<Vec<String>>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@@ -78,8 +75,6 @@ pub struct SearchQuery {
 pub filter: Option<Value>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
 pub sort: Option<Vec<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
-pub distinct: Option<String>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
 pub facets: Option<Vec<String>>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@@ -146,7 +141,6 @@ impl fmt::Debug for SearchQuery {
 page,
 hits_per_page,
 attributes_to_retrieve,
-retrieve_vectors,
 attributes_to_crop,
 crop_length,
 attributes_to_highlight,
@@ -155,7 +149,6 @@ impl fmt::Debug for SearchQuery {
 show_ranking_score_details,
 filter,
 sort,
-distinct,
 facets,
 highlight_pre_tag,
 highlight_post_tag,
@@ -180,9 +173,6 @@ impl fmt::Debug for SearchQuery {
 if let Some(q) = q {
 debug.field("q", &q);
 }
-if *retrieve_vectors {
-debug.field("retrieve_vectors", &retrieve_vectors);
-}
 if let Some(v) = vector {
 if v.len() < 10 {
 debug.field("vector", &v);
@@ -205,9 +195,6 @@ impl fmt::Debug for SearchQuery {
 if let Some(sort) = sort {
 debug.field("sort", &sort);
 }
-if let Some(distinct) = distinct {
-debug.field("distinct", &distinct);
-}
 if let Some(facets) = facets {
 debug.field("facets", &facets);
 }
@@ -383,8 +370,6 @@ pub struct SearchQueryWithIndex {
 pub hits_per_page: Option<usize>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
 pub attributes_to_retrieve: Option<BTreeSet<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
-pub retrieve_vectors: bool,
 #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
 pub attributes_to_crop: Option<Vec<String>>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@@ -401,8 +386,6 @@ pub struct SearchQueryWithIndex {
 pub filter: Option<Value>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
 pub sort: Option<Vec<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
-pub distinct: Option<String>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
 pub facets: Option<Vec<String>>,
 #[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@@ -430,7 +413,6 @@ impl SearchQueryWithIndex {
 page,
 hits_per_page,
 attributes_to_retrieve,
-retrieve_vectors,
 attributes_to_crop,
 crop_length,
 attributes_to_highlight,
@@ -439,7 +421,6 @@ impl SearchQueryWithIndex {
 show_matches_position,
 filter,
 sort,
-distinct,
 facets,
 highlight_pre_tag,
 highlight_post_tag,
@@ -459,7 +440,6 @@ impl SearchQueryWithIndex {
 page,
 hits_per_page,
 attributes_to_retrieve,
-retrieve_vectors,
 attributes_to_crop,
 crop_length,
 attributes_to_highlight,
@@ -468,7 +448,6 @@ impl SearchQueryWithIndex {
 show_matches_position,
 filter,
 sort,
-distinct,
 facets,
 highlight_pre_tag,
 highlight_post_tag,
@@ -499,8 +478,6 @@ pub struct SimilarQuery {
 pub embedder: Option<String>,
 #[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)]
 pub attributes_to_retrieve: Option<BTreeSet<String>>,
-#[deserr(default, error = DeserrJsonError<InvalidSimilarRetrieveVectors>)]
-pub retrieve_vectors: bool,
 #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)]
 pub show_ranking_score: bool,
 #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)]
@@ -739,10 +716,6 @@ fn prepare_search<'t>(
 search.ranking_score_threshold(ranking_score_threshold.0);
 }
 
-if let Some(distinct) = &query.distinct {
-search.distinct(distinct.clone());
-}
-
 match search_kind {
 SearchKind::KeywordOnly => {
 if let Some(q) = &query.q {
@@ -837,7 +810,6 @@ pub fn perform_search(
 index: &Index,
 query: SearchQuery,
 search_kind: SearchKind,
-retrieve_vectors: RetrieveVectors,
 ) -> Result<SearchResult, MeilisearchHttpError> {
 let before_search = Instant::now();
 let rtxn = index.read_txn()?;
@@ -875,8 +847,6 @@ pub fn perform_search(
 page,
 hits_per_page,
 attributes_to_retrieve,
-// use the enum passed as parameter
-retrieve_vectors: _,
 attributes_to_crop,
 crop_length,
 attributes_to_highlight,
@@ -896,12 +866,10 @@ pub fn perform_search(
 matching_strategy: _,
 attributes_to_search_on: _,
 filter: _,
-distinct: _,
 } = query;
 
 let format = AttributesFormat {
 attributes_to_retrieve,
-retrieve_vectors,
 attributes_to_highlight,
 attributes_to_crop,
 crop_length,
@@ -985,7 +953,6 @@ pub fn perform_search(
 
 struct AttributesFormat {
 attributes_to_retrieve: Option<BTreeSet<String>>,
|
attributes_to_retrieve: Option<BTreeSet<String>>,
|
||||||
retrieve_vectors: RetrieveVectors,
|
|
||||||
attributes_to_highlight: Option<HashSet<String>>,
|
attributes_to_highlight: Option<HashSet<String>>,
|
||||||
attributes_to_crop: Option<Vec<String>>,
|
attributes_to_crop: Option<Vec<String>>,
|
||||||
crop_length: usize,
|
crop_length: usize,
|
||||||
@@ -998,36 +965,6 @@ struct AttributesFormat {
|
|||||||
show_ranking_score_details: bool,
|
show_ranking_score_details: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
||||||
pub enum RetrieveVectors {
|
|
||||||
/// Do not touch the `_vectors` field
|
|
||||||
///
|
|
||||||
/// this is the behavior when the vectorStore feature is disabled
|
|
||||||
Ignore,
|
|
||||||
/// Remove the `_vectors` field
|
|
||||||
///
|
|
||||||
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false`
|
|
||||||
Hide,
|
|
||||||
/// Retrieve vectors from the DB and merge them into the `_vectors` field
|
|
||||||
///
|
|
||||||
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true`
|
|
||||||
Retrieve,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RetrieveVectors {
|
|
||||||
pub fn new(
|
|
||||||
retrieve_vector: bool,
|
|
||||||
features: index_scheduler::RoFeatures,
|
|
||||||
) -> Result<Self, index_scheduler::Error> {
|
|
||||||
match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) {
|
|
||||||
(true, Ok(())) => Ok(Self::Retrieve),
|
|
||||||
(true, Err(error)) => Err(error),
|
|
||||||
(false, Ok(())) => Ok(Self::Hide),
|
|
||||||
(false, Err(_)) => Ok(Self::Ignore),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn make_hits(
|
fn make_hits(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
rtxn: &RoTxn<'_>,
|
rtxn: &RoTxn<'_>,
|
||||||
@@ -1037,32 +974,10 @@ fn make_hits(
|
|||||||
document_scores: Vec<Vec<ScoreDetails>>,
|
document_scores: Vec<Vec<ScoreDetails>>,
|
||||||
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
|
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
|
||||||
let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
|
let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
|
||||||
let displayed_ids =
|
let displayed_ids = index
|
||||||
index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
|
.displayed_fields_ids(rtxn)?
|
||||||
|
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
|
||||||
let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
||||||
|
|
||||||
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
|
||||||
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
|
||||||
(None, _) => false,
|
|
||||||
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
|
||||||
(Some(_), None) => true,
|
|
||||||
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
|
|
||||||
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
|
||||||
};
|
|
||||||
|
|
||||||
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
|
|
||||||
if vectors_is_hidden {
|
|
||||||
RetrieveVectors::Hide
|
|
||||||
} else {
|
|
||||||
RetrieveVectors::Retrieve
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
format.retrieve_vectors
|
|
||||||
};
|
|
||||||
|
|
||||||
let displayed_ids =
|
|
||||||
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
|
||||||
let fids = |attrs: &BTreeSet<String>| {
|
let fids = |attrs: &BTreeSet<String>| {
|
||||||
let mut ids = BTreeSet::new();
|
let mut ids = BTreeSet::new();
|
||||||
for attr in attrs {
|
for attr in attrs {
|
||||||
@@ -1085,7 +1000,6 @@ fn make_hits(
|
|||||||
.intersection(&displayed_ids)
|
.intersection(&displayed_ids)
|
||||||
.cloned()
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
|
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
|
||||||
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
|
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
|
||||||
let formatted_options = compute_formatted_options(
|
let formatted_options = compute_formatted_options(
|
||||||
@@ -1119,48 +1033,18 @@ fn make_hits(
|
|||||||
formatter_builder.highlight_prefix(format.highlight_pre_tag);
|
formatter_builder.highlight_prefix(format.highlight_pre_tag);
|
||||||
formatter_builder.highlight_suffix(format.highlight_post_tag);
|
formatter_builder.highlight_suffix(format.highlight_post_tag);
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
let embedding_configs = index.embedding_configs(rtxn)?;
|
|
||||||
let documents_iter = index.documents(rtxn, documents_ids)?;
|
let documents_iter = index.documents(rtxn, documents_ids)?;
|
||||||
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
|
for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
|
||||||
// First generate a document with all the displayed fields
|
// First generate a document with all the displayed fields
|
||||||
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
|
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
|
||||||
|
|
||||||
let add_vectors_fid =
|
|
||||||
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
|
|
||||||
|
|
||||||
// select the attributes to retrieve
|
// select the attributes to retrieve
|
||||||
let attributes_to_retrieve = to_retrieve_ids
|
let attributes_to_retrieve = to_retrieve_ids
|
||||||
.iter()
|
.iter()
|
||||||
// skip the vectors_fid if RetrieveVectors::Hide
|
|
||||||
.filter(|fid| match vectors_fid {
|
|
||||||
Some(vectors_fid) => {
|
|
||||||
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
|
|
||||||
}
|
|
||||||
None => true,
|
|
||||||
})
|
|
||||||
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
|
|
||||||
.chain(add_vectors_fid.iter())
|
|
||||||
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
|
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
|
||||||
let mut document =
|
let mut document =
|
||||||
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
|
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
|
||||||
|
|
||||||
if retrieve_vectors == RetrieveVectors::Retrieve {
|
|
||||||
let mut vectors = match document.remove("_vectors") {
|
|
||||||
Some(Value::Object(map)) => map,
|
|
||||||
_ => Default::default(),
|
|
||||||
};
|
|
||||||
for (name, vector) in index.embeddings(rtxn, id)? {
|
|
||||||
let user_provided = embedding_configs
|
|
||||||
.iter()
|
|
||||||
.find(|conf| conf.name == name)
|
|
||||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
|
||||||
let embeddings =
|
|
||||||
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
|
|
||||||
vectors.insert(name, serde_json::to_value(embeddings)?);
|
|
||||||
}
|
|
||||||
document.insert("_vectors".into(), vectors.into());
|
|
||||||
}
|
|
||||||
|
|
||||||
let (matches_position, formatted) = format_fields(
|
let (matches_position, formatted) = format_fields(
|
||||||
&displayed_document,
|
&displayed_document,
|
||||||
&fields_ids_map,
|
&fields_ids_map,
|
||||||
@@ -1230,7 +1114,6 @@ pub fn perform_similar(
|
|||||||
query: SimilarQuery,
|
query: SimilarQuery,
|
||||||
embedder_name: String,
|
embedder_name: String,
|
||||||
embedder: Arc<Embedder>,
|
embedder: Arc<Embedder>,
|
||||||
retrieve_vectors: RetrieveVectors,
|
|
||||||
) -> Result<SimilarResult, ResponseError> {
|
) -> Result<SimilarResult, ResponseError> {
|
||||||
let before_search = Instant::now();
|
let before_search = Instant::now();
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
@@ -1242,7 +1125,6 @@ pub fn perform_similar(
|
|||||||
filter: _,
|
filter: _,
|
||||||
embedder: _,
|
embedder: _,
|
||||||
attributes_to_retrieve,
|
attributes_to_retrieve,
|
||||||
retrieve_vectors: _,
|
|
||||||
show_ranking_score,
|
show_ranking_score,
|
||||||
show_ranking_score_details,
|
show_ranking_score_details,
|
||||||
ranking_score_threshold,
|
ranking_score_threshold,
|
||||||
@@ -1289,7 +1171,6 @@ pub fn perform_similar(
|
|||||||
|
|
||||||
let format = AttributesFormat {
|
let format = AttributesFormat {
|
||||||
attributes_to_retrieve,
|
attributes_to_retrieve,
|
||||||
retrieve_vectors,
|
|
||||||
attributes_to_highlight: None,
|
attributes_to_highlight: None,
|
||||||
attributes_to_crop: None,
|
attributes_to_crop: None,
|
||||||
crop_length: DEFAULT_CROP_LENGTH(),
|
crop_length: DEFAULT_CROP_LENGTH(),
|
||||||
|
|||||||
@@ -182,10 +182,14 @@ impl Index<'_> {
 self.service.get(url).await
 }

-pub async fn get_document(&self, id: u64, options: Option<Value>) -> (Value, StatusCode) {
+pub async fn get_document(
+&self,
+id: u64,
+options: Option<GetDocumentOptions>,
+) -> (Value, StatusCode) {
 let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id);
-if let Some(options) = options {
-write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap();
+if let Some(fields) = options.and_then(|o| o.fields) {
+let _ = write!(url, "?fields={}", fields.join(","));
 }
 self.service.get(url).await
 }
@@ -201,11 +205,18 @@ impl Index<'_> {
 }

 pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) {
-let url = format!(
-"/indexes/{}/documents?{}",
-urlencode(self.uid.as_ref()),
-yaup::to_string(&options).unwrap()
-);
+let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref()));
+if let Some(limit) = options.limit {
+let _ = write!(url, "limit={}&", limit);
+}
+if let Some(offset) = options.offset {
+let _ = write!(url, "offset={}&", offset);
+}
+
+if let Some(attributes_to_retrieve) = options.attributes_to_retrieve {
+let _ = write!(url, "fields={}&", attributes_to_retrieve.join(","));
+}

 self.service.get(url).await
 }
@@ -424,11 +435,13 @@ impl Index<'_> {
 }
 }

-#[derive(Debug, Default, serde::Serialize)]
-#[serde(rename_all = "camelCase")]
+pub struct GetDocumentOptions {
+pub fields: Option<Vec<&'static str>>,
+}
+
+#[derive(Debug, Default)]
 pub struct GetAllDocumentsOptions {
 pub limit: Option<usize>,
 pub offset: Option<usize>,
-pub retrieve_vectors: bool,
-pub fields: Option<Vec<&'static str>>,
+pub attributes_to_retrieve: Option<Vec<&'static str>>,
 }
@@ -6,7 +6,7 @@ pub mod service;
 use std::fmt::{self, Display};

 #[allow(unused)]
-pub use index::GetAllDocumentsOptions;
+pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
 use meili_snap::json_string;
 use serde::{Deserialize, Serialize};
 #[allow(unused)]
@@ -65,7 +65,7 @@ impl Display for Value {
 write!(
 f,
 "{}",
-json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" })
+json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
 )
 }
 }
@@ -795,70 +795,3 @@ async fn fetch_document_by_filter() {
 }
 "###);
 }
-
-#[actix_rt::test]
-async fn retrieve_vectors() {
-let server = Server::new().await;
-let index = server.index("doggo");
-
-// GET ALL DOCUMENTS BY QUERY
-let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
-"code": "invalid_document_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
-}
-"###);
-let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
-"code": "feature_not_enabled",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
-}
-"###);
-
-// FETCH ALL DOCUMENTS BY POST
-let (response, _code) =
-index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`",
-"code": "invalid_document_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
-}
-"###);
-let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
-"code": "feature_not_enabled",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
-}
-"###);
-
-// GET A SINGLE DOCUMENT
-let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
-"code": "invalid_document_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
-}
-"###);
-let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
-snapshot!(json_string!(response), @r###"
-{
-"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
-"code": "feature_not_enabled",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
-}
-"###);
-}
@@ -4,7 +4,7 @@ use meili_snap::*;
 use urlencoding::encode as urlencode;

 use crate::common::encoder::Encoder;
-use crate::common::{GetAllDocumentsOptions, Server, Value};
+use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value};
 use crate::json;

 // TODO: partial test since we are testing error, amd error is not yet fully implemented in
@@ -59,7 +59,8 @@ async fn get_document() {
 })
 );

-let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await;
+let (response, code) =
+index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await;
 assert_eq!(code, 200);
 assert_eq!(
 response,
@@ -68,8 +69,9 @@ async fn get_document() {
 })
 );

-let (response, code) =
-index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await;
+let (response, code) = index
+.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) }))
+.await;
 assert_eq!(code, 200);
 assert_eq!(
 response,
@@ -209,7 +211,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {

 let (response, code) = index
 .get_all_documents(GetAllDocumentsOptions {
-fields: Some(vec!["name"]),
+attributes_to_retrieve: Some(vec!["name"]),
 ..Default::default()
 })
 .await;
@@ -223,19 +225,9 @@ async fn test_get_all_documents_attributes_to_retrieve() {
 assert_eq!(response["limit"], json!(20));
 assert_eq!(response["total"], json!(77));

-let (response, code) = index.get_all_documents_raw("?fields=").await;
-assert_eq!(code, 200);
-assert_eq!(response["results"].as_array().unwrap().len(), 20);
-for results in response["results"].as_array().unwrap() {
-assert_eq!(results.as_object().unwrap().keys().count(), 0);
-}
-assert_eq!(response["offset"], json!(0));
-assert_eq!(response["limit"], json!(20));
-assert_eq!(response["total"], json!(77));
-
 let (response, code) = index
 .get_all_documents(GetAllDocumentsOptions {
-fields: Some(vec!["wrong"]),
+attributes_to_retrieve: Some(vec![]),
 ..Default::default()
 })
 .await;
@@ -250,7 +242,22 @@ async fn test_get_all_documents_attributes_to_retrieve() {

 let (response, code) = index
 .get_all_documents(GetAllDocumentsOptions {
-fields: Some(vec!["name", "tags"]),
+attributes_to_retrieve: Some(vec!["wrong"]),
+..Default::default()
+})
+.await;
+assert_eq!(code, 200);
+assert_eq!(response["results"].as_array().unwrap().len(), 20);
+for results in response["results"].as_array().unwrap() {
+assert_eq!(results.as_object().unwrap().keys().count(), 0);
+}
+assert_eq!(response["offset"], json!(0));
+assert_eq!(response["limit"], json!(20));
+assert_eq!(response["total"], json!(77));
+
+let (response, code) = index
+.get_all_documents(GetAllDocumentsOptions {
+attributes_to_retrieve: Some(vec!["name", "tags"]),
 ..Default::default()
 })
 .await;
@@ -263,7 +270,10 @@ async fn test_get_all_documents_attributes_to_retrieve() {
 }

 let (response, code) = index
-.get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() })
+.get_all_documents(GetAllDocumentsOptions {
+attributes_to_retrieve: Some(vec!["*"]),
+..Default::default()
+})
 .await;
 assert_eq!(code, 200);
 assert_eq!(response["results"].as_array().unwrap().len(), 20);
@@ -273,7 +283,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {

 let (response, code) = index
 .get_all_documents(GetAllDocumentsOptions {
-fields: Some(vec!["*", "wrong"]),
+attributes_to_retrieve: Some(vec!["*", "wrong"]),
 ..Default::default()
 })
 .await;
@@ -306,10 +316,12 @@ async fn get_document_s_nested_attributes_to_retrieve() {
 assert_eq!(code, 202);
 index.wait_task(1).await;

-let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await;
+let (response, code) =
+index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
 assert_eq!(code, 200);
 assert_eq!(response, json!({}));
-let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await;
+let (response, code) =
+index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
 assert_eq!(code, 200);
 assert_eq!(
 response,
@@ -321,7 +333,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
 })
 );

-let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await;
+let (response, code) = index
+.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
+.await;
 assert_eq!(code, 200);
 assert_eq!(
 response,
@@ -329,7 +343,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
 "content.truc": "foobar",
 })
 );
-let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await;
+let (response, code) = index
+.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
+.await;
 assert_eq!(code, 200);
 assert_eq!(
 response,
@@ -524,207 +540,3 @@ async fn get_document_by_filter() {
 }
 "###);
 }
-
-#[actix_rt::test]
-async fn get_document_with_vectors() {
-let server = Server::new().await;
-let index = server.index("doggo");
-let (value, code) = server.set_features(json!({"vectorStore": true})).await;
-snapshot!(code, @"200 OK");
-snapshot!(value, @r###"
-{
-"vectorStore": true,
-"metrics": false,
-"logsRoute": false
-}
-"###);
-
-let (response, code) = index
-.update_settings(json!({
-"embedders": {
-"manual": {
-"source": "userProvided",
-"dimensions": 3,
-}
-},
-}))
-.await;
-snapshot!(code, @"202 Accepted");
-server.wait_task(response.uid()).await;
-
-let documents = json!([
-{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
-{"id": 1, "name": "echo", "_vectors": { "manual": null }},
-]);
-let (value, code) = index.add_documents(documents, None).await;
-snapshot!(code, @"202 Accepted");
-index.wait_task(value.uid()).await;
-
-// by default you shouldn't see the `_vectors` object
-let (documents, _code) = index.get_all_documents(Default::default()).await;
-snapshot!(json_string!(documents), @r###"
-{
-"results": [
-{
-"id": 0,
-"name": "kefir"
-},
-{
-"id": 1,
-"name": "echo"
-}
-],
-"offset": 0,
-"limit": 20,
-"total": 2
-}
-"###);
-let (documents, _code) = index.get_document(0, None).await;
-snapshot!(json_string!(documents), @r###"
-{
-"id": 0,
-"name": "kefir"
-}
-"###);
-
-// if we try to retrieve the vectors with the `fields` parameter they
-// still shouldn't be displayed
-let (documents, _code) = index
-.get_all_documents(GetAllDocumentsOptions {
-fields: Some(vec!["name", "_vectors"]),
-..Default::default()
-})
-.await;
-snapshot!(json_string!(documents), @r###"
-{
-"results": [
-{
-"name": "kefir"
-},
-{
-"name": "echo"
-}
-],
-"offset": 0,
-"limit": 20,
-"total": 2
-}
-"###);
-let (documents, _code) =
-index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await;
-snapshot!(json_string!(documents), @r###"
-{
-"name": "kefir"
-}
-"###);
-
-// If we specify the retrieve vectors boolean and nothing else we should get the vectors
-let (documents, _code) = index
-.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
-.await;
-snapshot!(json_string!(documents), @r###"
-{
-"results": [
-{
-"id": 0,
-"name": "kefir",
-"_vectors": {
-"manual": {
-"embeddings": [
-[
-0.0,
-0.0,
-0.0
-]
-],
-"regenerate": false
-}
-}
-},
-{
-"id": 1,
-"name": "echo",
-"_vectors": {}
-}
-],
-"offset": 0,
-"limit": 20,
-"total": 2
-}
-"###);
-let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
-snapshot!(json_string!(documents), @r###"
-{
-"id": 0,
-"name": "kefir",
-"_vectors": {
-"manual": {
-"embeddings": [
-[
-0.0,
-0.0,
-0.0
-]
-],
-"regenerate": false
-}
-}
-}
-"###);
-
-// If we specify the retrieve vectors boolean and exclude vectors form the `fields` we should still get the vectors
-let (documents, _code) = index
-.get_all_documents(GetAllDocumentsOptions {
-retrieve_vectors: true,
-fields: Some(vec!["name"]),
-..Default::default()
-})
-.await;
-snapshot!(json_string!(documents), @r###"
-{
-"results": [
-{
-"name": "kefir",
-"_vectors": {
-"manual": {
-"embeddings": [
-[
-0.0,
-0.0,
-0.0
-]
-],
-"regenerate": false
-}
-}
-},
-{
-"name": "echo",
-"_vectors": {}
-}
-],
-"offset": 0,
-"limit": 20,
-"total": 2
-}
-"###);
-let (documents, _code) =
-index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await;
-snapshot!(json_string!(documents), @r###"
-{
-"name": "kefir",
-"_vectors": {
-"manual": {
-"embeddings": [
-[
-0.0,
-0.0,
-0.0
-]
-],
-"regenerate": false
-}
-}
-}
-"###);
-}
@@ -1938,210 +1938,3 @@ async fn import_dump_v6_containing_experimental_features() {
 })
 .await;
 }
-
-// In this test we must generate the dump ourselves to ensure the
-// `user provided` vectors are well set
-#[actix_rt::test]
-#[cfg_attr(target_os = "windows", ignore)]
-async fn generate_and_import_dump_containing_vectors() {
-let temp = tempfile::tempdir().unwrap();
-let mut opt = default_settings(temp.path());
-let server = Server::new_with_options(opt.clone()).await.unwrap();
-let (code, _) = server.set_features(json!({"vectorStore": true})).await;
-snapshot!(code, @r###"
-{
-"vectorStore": true,
-"metrics": false,
-"logsRoute": false
-}
-"###);
-let index = server.index("pets");
-let (response, code) = index
-.update_settings(json!(
-{
-"embedders": {
-"doggo_embedder": {
-"source": "huggingFace",
-"model": "sentence-transformers/all-MiniLM-L6-v2",
-"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
-"documentTemplate": "{{doc.doggo}}",
-}
-}
-}
-))
-.await;
-snapshot!(code, @"202 Accepted");
-let response = index.wait_task(response.uid()).await;
-snapshot!(response);
-let (response, code) = index
-.add_documents(
-json!([
-{"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }},
-{"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}},
-{"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}},
-{"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}},
-{"id": 4, "doggo": "max" },
-]),
-None,
-)
-.await;
-snapshot!(code, @"202 Accepted");
-let response = index.wait_task(response.uid()).await;
-snapshot!(response);
-
-let (response, code) = server.create_dump().await;
-snapshot!(code, @"202 Accepted");
-let response = index.wait_task(response.uid()).await;
-snapshot!(response["status"], @r###""succeeded""###);
-
-// ========= We made a dump, now we should clear the DB and try to import our dump
-drop(server);
-tokio::fs::remove_dir_all(&opt.db_path).await.unwrap();
-let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap());
-let dump_path = opt.dump_dir.join(dump_name);
-assert!(dump_path.exists(), "path: `{}`", dump_path.display());
-
-opt.import_dump = Some(dump_path);
-// NOTE: We shouldn't have to change the database path but I lost one hour
-// because of a « bad path » error and that fixed it.
-opt.db_path = temp.path().join("data.ms");
-
-let mut server = Server::new_auth_with_options(opt, temp).await;
-server.use_api_key("MASTER_KEY");
-
-let (indexes, code) = server.list_indexes(None, None).await;
-assert_eq!(code, 200, "{indexes}");
-
-snapshot!(indexes["results"].as_array().unwrap().len(), @"1");
-snapshot!(indexes["results"][0]["uid"], @r###""pets""###);
-snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###);
-
-let (response, code) = server.get_features().await;
-meili_snap::snapshot!(code, @"200 OK");
-meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
-{
-"vectorStore": true,
-"metrics": false,
-"logsRoute": false
-}
-"###);
-
-let index = server.index("pets");
-
-let (response, code) = index.settings().await;
-meili_snap::snapshot!(code, @"200 OK");
-meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
-{
-"displayedAttributes": [
-"*"
-],
-"searchableAttributes": [
-"*"
-],
-"filterableAttributes": [],
-"sortableAttributes": [],
-"rankingRules": [
-"words",
-"typo",
-"proximity",
-"attribute",
-"sort",
-"exactness"
-],
-"stopWords": [],
-"nonSeparatorTokens": [],
-"separatorTokens": [],
-"dictionary": [],
-"synonyms": {},
-"distinctAttribute": null,
-"proximityPrecision": "byWord",
-"typoTolerance": {
-"enabled": true,
-"minWordSizeForTypos": {
-"oneTypo": 5,
-"twoTypos": 9
-},
-"disableOnWords": [],
-"disableOnAttributes": []
-},
-"faceting": {
-"maxValuesPerFacet": 100,
-"sortFacetValuesBy": {
-"*": "alpha"
-}
-},
-"pagination": {
-"maxTotalHits": 1000
-},
-"embedders": {
-"doggo_embedder": {
-"source": "huggingFace",
-"model": "sentence-transformers/all-MiniLM-L6-v2",
-"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
-"documentTemplate": "{{doc.doggo}}"
-}
-},
-"searchCutoffMs": null
-}
-"###);
-
-index
-.search(json!({"retrieveVectors": true}), |response, code| {
-snapshot!(code, @"200 OK");
-snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###"
-[
-{
-"id": 0,
-"doggo": "kefir",
-"_vectors": {
-"doggo_embedder": {
-"embeddings": "[vector]",
-"regenerate": false
-}
-}
-},
-{
-"id": 1,
-"doggo": "echo",
-"_vectors": {
-"doggo_embedder": {
-"embeddings": "[vector]",
-"regenerate": false
-}
-}
-},
-{
-"id": 2,
-"doggo": "intel",
-"_vectors": {
-"doggo_embedder": {
-"embeddings": "[vector]",
-"regenerate": true
-}
-}
-},
-{
-"id": 3,
-"doggo": "bill",
-"_vectors": {
-"doggo_embedder": {
-"embeddings": "[vector]",
-"regenerate": true
-}
-}
-},
-{
-"id": 4,
-"doggo": "max",
-"_vectors": {
-"doggo_embedder": {
-"embeddings": "[vector]",
-"regenerate": true
-}
-}
-}
-]
-"###);
-})
-.await;
-}
@@ -1,25 +0,0 @@
----
-source: meilisearch/tests/dumps/mod.rs
----
-{
-"uid": 0,
-"indexUid": "pets",
-"status": "succeeded",
-"type": "settingsUpdate",
-"canceledBy": null,
-"details": {
-"embedders": {
-"doggo_embedder": {
-"source": "huggingFace",
-"model": "sentence-transformers/all-MiniLM-L6-v2",
-"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
-"documentTemplate": "{{doc.doggo}}"
-}
-}
-},
-"error": null,
-"duration": "[duration]",
-"enqueuedAt": "[date]",
-"startedAt": "[date]",
-"finishedAt": "[date]"
-}
@@ -1,19 +0,0 @@
----
-source: meilisearch/tests/dumps/mod.rs
----
-{
-"uid": 1,
-"indexUid": "pets",
-"status": "succeeded",
-"type": "documentAdditionOrUpdate",
-"canceledBy": null,
-"details": {
-"receivedDocuments": 5,
-"indexedDocuments": 5
-},
-"error": null,
-"duration": "[duration]",
-"enqueuedAt": "[date]",
-"startedAt": "[date]",
-"finishedAt": "[date]"
-}
@@ -13,7 +13,6 @@ mod snapshot;
 mod stats;
 mod swap_indexes;
 mod tasks;
-mod vector;

 // Tests are isolated by features in different modules to allow better readability, test
 // targetability, and improved incremental compilation times.
@@ -107,39 +107,6 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
 ])
 });

-static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
-json!([
-{
-"id": 1,
-"description": "Leather Jacket",
-"brand": "Lee Jeans",
-"product_id": "123456",
-"color": { "main": "Brown", "pattern": "stripped" },
-},
-{
-"id": 2,
-"description": "Leather Jacket",
-"brand": "Lee Jeans",
-"product_id": "123456",
-"color": { "main": "Black", "pattern": "stripped" },
-},
-{
-"id": 3,
-"description": "Leather Jacket",
-"brand": "Lee Jeans",
-"product_id": "123456",
-"color": { "main": "Blue", "pattern": "used" },
-},
-{
-"id": 4,
-"description": "T-Shirt",
-"brand": "Nike",
-"product_id": "789012",
-"color": { "main": "Blue", "pattern": "stripped" },
-}
-])
-});
-
 static DOCUMENT_PRIMARY_KEY: &str = "id";
 static DOCUMENT_DISTINCT_KEY: &str = "product_id";

@@ -272,35 +239,3 @@ async fn distinct_search_with_pagination_no_ranking() {
 snapshot!(response["totalPages"], @"2");
 snapshot!(response["totalHits"], @"6");
 }
-
-#[actix_rt::test]
-async fn distinct_at_search_time() {
-let server = Server::new().await;
-let index = server.index("tamo");
-
-let documents = NESTED_DOCUMENTS.clone();
-index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
-let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await;
-let task = index.wait_task(task.uid()).await;
-snapshot!(task, name: "succeed");
-
-fn get_hits(response: &Value) -> Vec<String> {
-let hits_array = response["hits"]
-.as_array()
-.unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap()));
-hits_array
-.iter()
-.map(|h| h[DOCUMENT_PRIMARY_KEY].as_number().unwrap().to_string())
-.collect::<Vec<_>>()
-}
-
-let (response, code) =
-index.search_post(json!({"page": 1, "hitsPerPage": 3, "distinct": "color.main"})).await;
-let hits = get_hits(&response);
-snapshot!(code, @"200 OK");
-snapshot!(hits.len(), @"3");
-snapshot!(format!("{:?}", hits), @r###"["1", "2", "3"]"###);
-snapshot!(response["page"], @"1");
-snapshot!(response["totalPages"], @"1");
-snapshot!(response["totalHits"], @"3");
-}
@@ -167,74 +167,6 @@ async fn search_bad_hits_per_page() {
 "###);
 }
-
-#[actix_rt::test]
-async fn search_bad_attributes_to_retrieve() {
-let server = Server::new().await;
-let index = server.index("test");
-
-let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`",
-"code": "invalid_search_attributes_to_retrieve",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve"
-}
-"###);
-// Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings.
-}
-
-#[actix_rt::test]
-async fn search_bad_retrieve_vectors() {
-let server = Server::new().await;
-let index = server.index("test");
-
-let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
-"code": "invalid_search_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
-}
-"###);
-
-let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
-"code": "invalid_search_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
-}
-"###);
-
-let (response, code) = index.search_get("retrieveVectors=").await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
-"code": "invalid_search_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
-}
-"###);
-
-let (response, code) = index.search_get("retrieveVectors=doggo").await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(json_string!(response), @r###"
-{
-"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
-"code": "invalid_search_retrieve_vectors",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
-}
-"###);
-}
-
 #[actix_rt::test]
 async fn search_bad_attributes_to_crop() {
 let server = Server::new().await;
@@ -1140,66 +1072,3 @@ async fn search_on_unknown_field_plus_joker() {
 )
 .await;
 }
-
-#[actix_rt::test]
-async fn distinct_at_search_time() {
-let server = Server::new().await;
-let index = server.index("tamo");
-let (task, _) = index.create(None).await;
-let task = index.wait_task(task.uid()).await;
-snapshot!(task, name: "task-succeed");
-
-let (response, code) =
-index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(response, @r###"
-{
-"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.",
-"code": "invalid_search_distinct",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
-}
-"###);
-
-let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await;
-index.wait_task(task.uid()).await;
-
-let (response, code) =
-index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(response, @r###"
-{
-"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.",
-"code": "invalid_search_distinct",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
-}
-"###);
-
-let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await;
-index.wait_task(task.uid()).await;
-
-let (response, code) =
-index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(response, @r###"
-{
-"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.",
-"code": "invalid_search_distinct",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
-}
-"###);
-
-let (response, code) =
-index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await;
-snapshot!(code, @"400 Bad Request");
-snapshot!(response, @r###"
-{
-"message": "Invalid value type at `.distinct`: expected a string, but found a boolean: `true`",
-"code": "invalid_search_distinct",
-"type": "invalid_request",
-"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
-}
-"###);
-}
@@ -124,61 +124,32 @@ async fn simple_search() {
|
|||||||
|
|
||||||
let (response, code) = index
|
let (response, code) = index
|
||||||
.search_post(
|
.search_post(
|
||||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
|
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###);
|
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###);
|
||||||
snapshot!(response["semanticHitCount"], @"0");
|
snapshot!(response["semanticHitCount"], @"0");
|
||||||
|
|
||||||
let (response, code) = index
|
let (response, code) = index
|
||||||
.search_post(
|
.search_post(
|
||||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}),
|
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
|
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
|
||||||
snapshot!(response["semanticHitCount"], @"2");
|
snapshot!(response["semanticHitCount"], @"2");
|
||||||
|
|
||||||
let (response, code) = index
|
let (response, code) = index
|
||||||
.search_post(
|
.search_post(
|
||||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}),
|
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
|
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
|
||||||
snapshot!(response["semanticHitCount"], @"3");
|
snapshot!(response["semanticHitCount"], @"3");
|
||||||
}
|
}
|
||||||
|
|
||||||
- #[actix_rt::test]
- async fn limit_offset() {
- let server = Server::new().await;
- let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
-
- let (response, code) = index
- .search_post(
- json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}),
- )
- .await;
- snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###);
- snapshot!(response["semanticHitCount"], @"0");
- assert_eq!(response["hits"].as_array().unwrap().len(), 1);
-
- let server = Server::new().await;
- let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
-
- let (response, code) = index
- .search_post(
- json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}),
- )
- .await;
- snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###);
- snapshot!(response["semanticHitCount"], @"1");
- assert_eq!(response["hits"].as_array().unwrap().len(), 1);
- }
-
#[actix_rt::test]
async fn simple_search_hf() {
let server = Server::new().await;

@@ -233,10 +204,10 @@ async fn distribution_shift() {
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;

- let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true});
+ let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}});
let (response, code) = index.search_post(search.clone()).await;
snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);

let (response, code) = index
.update_settings(json!({
@@ -257,7 +228,7 @@ async fn distribution_shift() {

let (response, code) = index.search_post(search).await;
snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###);
}

#[actix_rt::test]
@@ -268,23 +239,20 @@ async fn highlighter() {
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.2},
- "retrieveVectors": true,
"attributesToHighlight": [
- "desc",
- "_vectors",
+ "desc"
],
"highlightPreTag": "**BEGIN**",
- "highlightPostTag": "**END**",
+ "highlightPostTag": "**END**"
}))
.await;
snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###);
snapshot!(response["semanticHitCount"], @"0");

let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.8},
- "retrieveVectors": true,
"showRankingScore": true,
"attributesToHighlight": [
"desc"
@@ -294,14 +262,13 @@ async fn highlighter() {
}))
.await;
snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3");

// no highlighting on full semantic
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 1.0},
- "retrieveVectors": true,
"showRankingScore": true,
"attributesToHighlight": [
"desc"
@@ -311,7 +278,7 @@ async fn highlighter() {
}))
.await;
snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3");
}

@@ -394,12 +361,12 @@ async fn single_document() {

let (response, code) = index
.search_post(
- json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
+ json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
)
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###);
+ snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###);
snapshot!(response["semanticHitCount"], @"1");
}

@@ -410,25 +377,25 @@ async fn query_combination() {

// search without query and vector, but with hybrid => still placeholder
let (response, code) = index
- .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");

// same with a different semantic ratio
let (response, code) = index
- .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true}))
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");

// wrong vector dimensions
let (response, code) = index
- .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;

snapshot!(code, @"400 Bad Request");
@@ -443,34 +410,34 @@ async fn query_combination() {

// full vector
let (response, code) = index
- .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###);
snapshot!(response["semanticHitCount"], @"3");

// full keyword, without a query
let (response, code) = index
- .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");

// query + vector, full keyword => keyword
let (response, code) = index
- .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"null");

// query + vector, no hybrid keyword =>
let (response, code) = index
- .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true}))
+ .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true}))
.await;

snapshot!(code, @"400 Bad Request");
@@ -486,7 +453,7 @@ async fn query_combination() {
// full vector, without a vector => error
let (response, code) = index
.search_post(
- json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
+ json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
)
.await;

@@ -503,93 +470,11 @@ async fn query_combination() {
// hybrid without a vector => full keyword
let (response, code) = index
.search_post(
- json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}),
+ json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}),
)
.await;

snapshot!(code, @"200 OK");
- snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
+ snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"0");
}

- #[actix_rt::test]
- async fn retrieve_vectors() {
- let server = Server::new().await;
- let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
-
- let (response, code) = index
- .search_post(
- json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
- )
- .await;
- snapshot!(code, @"200 OK");
- insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
- [
- {
- "title": "Captain Planet",
- "desc": "He's not part of the Marvel Cinematic Universe",
- "id": "2",
- "_vectors": {
- "default": {
- "embeddings": "[vectors]",
- "regenerate": true
- }
- }
- },
- {
- "title": "Captain Marvel",
- "desc": "a Shazam ersatz",
- "id": "3",
- "_vectors": {
- "default": {
- "embeddings": "[vectors]",
- "regenerate": true
- }
- }
- },
- {
- "title": "Shazam!",
- "desc": "a Captain Marvel ersatz",
- "id": "1",
- "_vectors": {
- "default": {
- "embeddings": "[vectors]",
- "regenerate": true
- }
- }
- }
- ]
- "###);
-
- // remove `_vectors` from displayed attributes
- let (response, code) =
- index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
- assert_eq!(202, code, "{:?}", response);
- index.wait_task(response.uid()).await;
-
- let (response, code) = index
- .search_post(
- json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
- )
- .await;
- snapshot!(code, @"200 OK");
- insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
- [
- {
- "title": "Captain Planet",
- "desc": "He's not part of the Marvel Cinematic Universe",
- "id": "2"
- },
- {
- "title": "Captain Marvel",
- "desc": "a Shazam ersatz",
- "id": "3"
- },
- {
- "title": "Shazam!",
- "desc": "a Captain Marvel ersatz",
- "id": "1"
- }
- ]
- "###);
- }
@@ -301,7 +301,7 @@ async fn negative_special_cases_search() {
index.add_documents(documents, None).await;
index.wait_task(0).await;

- index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await;
+ index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await;
index.wait_task(1).await;

// There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass
@@ -1290,38 +1290,21 @@ async fn experimental_feature_vector_store() {
|
|||||||
index.add_documents(json!(documents), None).await;
|
index.add_documents(json!(documents), None).await;
|
||||||
index.wait_task(0).await;
|
index.wait_task(0).await;
|
||||||
|
|
||||||
index
|
let (response, code) = index
|
||||||
.search(json!({
|
.search_post(json!({
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"showRankingScore": true
|
"showRankingScore": true
|
||||||
}), |response, code|{
|
}))
|
||||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
|
|
||||||
"code": "feature_not_enabled",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
})
|
|
||||||
.await;
|
|
||||||
index
|
|
||||||
.search(json!({
|
|
||||||
"retrieveVectors": true,
|
|
||||||
"showRankingScore": true
|
|
||||||
}), |response, code|{
|
|
||||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
|
|
||||||
"code": "feature_not_enabled",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
})
|
|
||||||
.await;
|
.await;
|
||||||
|
meili_snap::snapshot!(code, @"400 Bad Request");
|
||||||
|
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||||
|
{
|
||||||
|
"message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
|
||||||
|
"code": "feature_not_enabled",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
|
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||||
meili_snap::snapshot!(code, @"200 OK");
|
meili_snap::snapshot!(code, @"200 OK");
|
||||||
@@ -1354,7 +1337,6 @@ async fn experimental_feature_vector_store() {
|
|||||||
.search_post(json!({
|
.search_post(json!({
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"showRankingScore": true,
|
"showRankingScore": true,
|
||||||
"retrieveVectors": true,
|
|
||||||
}))
|
}))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -1366,16 +1348,11 @@ async fn experimental_feature_vector_store() {
|
|||||||
"title": "Shazam!",
|
"title": "Shazam!",
|
||||||
"id": "287947",
|
"id": "287947",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
1.0,
|
||||||
[
|
2.0,
|
||||||
1.0,
|
3.0
|
||||||
2.0,
|
]
|
||||||
3.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 1.0
|
"_rankingScore": 1.0
|
||||||
},
|
},
|
||||||
@@ -1383,16 +1360,11 @@ async fn experimental_feature_vector_store() {
|
|||||||
"title": "Captain Marvel",
|
"title": "Captain Marvel",
|
||||||
"id": "299537",
|
"id": "299537",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
1.0,
|
||||||
[
|
2.0,
|
||||||
1.0,
|
54.0
|
||||||
2.0,
|
]
|
||||||
54.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.9129111766815186
|
"_rankingScore": 0.9129111766815186
|
||||||
},
|
},
|
||||||
@@ -1400,16 +1372,11 @@ async fn experimental_feature_vector_store() {
|
|||||||
"title": "Gläss",
|
"title": "Gläss",
|
||||||
"id": "450465",
|
"id": "450465",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
-100.0,
|
||||||
[
|
340.0,
|
||||||
-100.0,
|
90.0
|
||||||
340.0,
|
]
|
||||||
90.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.8106412887573242
|
"_rankingScore": 0.8106412887573242
|
||||||
},
|
},
|
||||||
@@ -1417,16 +1384,11 @@ async fn experimental_feature_vector_store() {
|
|||||||
"title": "How to Train Your Dragon: The Hidden World",
|
"title": "How to Train Your Dragon: The Hidden World",
|
||||||
"id": "166428",
|
"id": "166428",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
-100.0,
|
||||||
[
|
231.0,
|
||||||
-100.0,
|
32.0
|
||||||
231.0,
|
]
|
||||||
32.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.7412010431289673
|
"_rankingScore": 0.7412010431289673
|
||||||
},
|
},
|
||||||
@@ -1434,16 +1396,11 @@ async fn experimental_feature_vector_store() {
|
|||||||
"title": "Escape Room",
|
"title": "Escape Room",
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
10.0,
|
||||||
[
|
-23.0,
|
||||||
10.0,
|
32.0
|
||||||
-23.0,
|
]
|
||||||
32.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.6972063183784485
|
"_rankingScore": 0.6972063183784485
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,20 +0,0 @@
- ---
- source: meilisearch/tests/search/distinct.rs
- ---
- {
- "uid": 1,
- "indexUid": "tamo",
- "status": "succeeded",
- "type": "settingsUpdate",
- "canceledBy": null,
- "details": {
- "filterableAttributes": [
- "color.main"
- ]
- },
- "error": null,
- "duration": "[duration]",
- "enqueuedAt": "[date]",
- "startedAt": "[date]",
- "finishedAt": "[date]"
- }
@@ -1,18 +0,0 @@
- ---
- source: meilisearch/tests/search/errors.rs
- ---
- {
- "uid": 0,
- "indexUid": "tamo",
- "status": "succeeded",
- "type": "indexCreation",
- "canceledBy": null,
- "details": {
- "primaryKey": null
- },
- "error": null,
- "duration": "[duration]",
- "enqueuedAt": "[date]",
- "startedAt": "[date]",
- "finishedAt": "[date]"
- }
@@ -756,54 +756,3 @@ async fn filter_reserved_geo_point_string() {
|
|||||||
})
|
})
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn similar_bad_retrieve_vectors() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
server.set_features(json!({"vectorStore": true})).await;
|
|
||||||
let index = server.index("test");
|
|
||||||
|
|
||||||
let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await;
|
|
||||||
snapshot!(code, @"400 Bad Request");
|
|
||||||
snapshot!(json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
|
|
||||||
"code": "invalid_similar_retrieve_vectors",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await;
|
|
||||||
snapshot!(code, @"400 Bad Request");
|
|
||||||
snapshot!(json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
|
|
||||||
"code": "invalid_similar_retrieve_vectors",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index.similar_get("retrieveVectors=").await;
|
|
||||||
snapshot!(code, @"400 Bad Request");
|
|
||||||
snapshot!(json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
|
|
||||||
"code": "invalid_similar_retrieve_vectors",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index.similar_get("retrieveVectors=doggo").await;
|
|
||||||
snapshot!(code, @"400 Bad Request");
|
|
||||||
snapshot!(json_string!(response), @r###"
|
|
||||||
{
|
|
||||||
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
|
|
||||||
"code": "invalid_similar_retrieve_vectors",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ async fn basic() {
|
|||||||
index.wait_task(value.uid()).await;
|
index.wait_task(value.uid()).await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(json!({"id": 143, "retrieveVectors": true}), |response, code| {
|
.similar(json!({"id": 143}), |response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
[
|
[
|
||||||
@@ -87,16 +87,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -104,16 +99,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "299537",
|
"id": "299537",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.6,
|
||||||
[
|
0.8,
|
||||||
0.6000000238418579,
|
-0.2
|
||||||
0.800000011920929,
|
]
|
||||||
-0.20000000298023224
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -121,16 +111,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "166428",
|
"id": "166428",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.7,
|
||||||
[
|
0.7,
|
||||||
0.699999988079071,
|
-0.4
|
||||||
0.699999988079071,
|
]
|
||||||
-0.4000000059604645
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -138,16 +123,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "287947",
|
"id": "287947",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.8,
|
||||||
[
|
0.4,
|
||||||
0.800000011920929,
|
-0.5
|
||||||
0.4000000059604645,
|
]
|
||||||
-0.5
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -156,7 +136,7 @@ async fn basic() {
|
|||||||
.await;
|
.await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| {
|
.similar(json!({"id": "299537"}), |response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
[
|
[
|
||||||
@@ -165,16 +145,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "166428",
|
"id": "166428",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.7,
|
||||||
[
|
0.7,
|
||||||
0.699999988079071,
|
-0.4
|
||||||
0.699999988079071,
|
]
|
||||||
-0.4000000059604645
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -182,16 +157,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "287947",
|
"id": "287947",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.8,
|
||||||
[
|
0.4,
|
||||||
0.800000011920929,
|
-0.5
|
||||||
0.4000000059604645,
|
]
|
||||||
-0.5
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -199,16 +169,11 @@ async fn basic() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -216,16 +181,11 @@ async fn basic() {
|
|||||||
"release_year": 1930,
|
"release_year": 1930,
|
||||||
"id": "143",
|
"id": "143",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
-0.5,
|
||||||
[
|
0.3,
|
||||||
-0.5,
|
0.85
|
||||||
0.30000001192092896,
|
]
|
||||||
0.8500000238418579
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -268,7 +228,7 @@ async fn ranking_score_threshold() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(
|
||||||
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}),
|
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}),
|
||||||
|response, code| {
|
|response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4");
|
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4");
|
||||||
@@ -279,16 +239,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.890957772731781
|
"_rankingScore": 0.890957772731781
|
||||||
},
|
},
|
||||||
@@ -297,16 +252,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "299537",
|
"id": "299537",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.6,
|
||||||
[
|
0.8,
|
||||||
0.6000000238418579,
|
-0.2
|
||||||
0.800000011920929,
|
]
|
||||||
-0.20000000298023224
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.39060014486312866
|
"_rankingScore": 0.39060014486312866
|
||||||
},
|
},
|
||||||
@@ -315,16 +265,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "166428",
|
"id": "166428",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.7,
|
||||||
[
|
0.7,
|
||||||
0.699999988079071,
|
-0.4
|
||||||
0.699999988079071,
|
]
|
||||||
-0.4000000059604645
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.2819308042526245
|
"_rankingScore": 0.2819308042526245
|
||||||
},
|
},
|
||||||
@@ -333,16 +278,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "287947",
|
"id": "287947",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.8,
|
||||||
[
|
0.4,
|
||||||
0.800000011920929,
|
-0.5
|
||||||
0.4000000059604645,
|
]
|
||||||
-0.5
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.1662663221359253
|
"_rankingScore": 0.1662663221359253
|
||||||
}
|
}
|
||||||
@@ -354,7 +294,7 @@ async fn ranking_score_threshold() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(
|
||||||
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}),
|
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}),
|
||||||
|response, code| {
|
|response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3");
|
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3");
|
||||||
@@ -365,16 +305,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.890957772731781
|
"_rankingScore": 0.890957772731781
|
||||||
},
|
},
|
||||||
@@ -383,16 +318,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "299537",
|
"id": "299537",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.6,
|
||||||
[
|
0.8,
|
||||||
0.6000000238418579,
|
-0.2
|
||||||
0.800000011920929,
|
]
|
||||||
-0.20000000298023224
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.39060014486312866
|
"_rankingScore": 0.39060014486312866
|
||||||
},
|
},
|
||||||
@@ -401,16 +331,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "166428",
|
"id": "166428",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.7,
|
||||||
[
|
0.7,
|
||||||
0.699999988079071,
|
-0.4
|
||||||
0.699999988079071,
|
]
|
||||||
-0.4000000059604645
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.2819308042526245
|
"_rankingScore": 0.2819308042526245
|
||||||
}
|
}
|
||||||
@@ -422,7 +347,7 @@ async fn ranking_score_threshold() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(
|
||||||
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}),
|
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}),
|
||||||
|response, code| {
|
|response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2");
|
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2");
|
||||||
@@ -433,16 +358,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.890957772731781
|
"_rankingScore": 0.890957772731781
|
||||||
},
|
},
|
||||||
@@ -451,16 +371,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "299537",
|
"id": "299537",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.6,
|
||||||
[
|
0.8,
|
||||||
0.6000000238418579,
|
-0.2
|
||||||
0.800000011920929,
|
]
|
||||||
-0.20000000298023224
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.39060014486312866
|
"_rankingScore": 0.39060014486312866
|
||||||
}
|
}
|
||||||
@@ -472,7 +387,7 @@ async fn ranking_score_threshold() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(
|
||||||
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}),
|
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}),
|
||||||
|response, code| {
|
|response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1");
|
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1");
|
||||||
@@ -483,16 +398,11 @@ async fn ranking_score_threshold() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"_rankingScore": 0.890957772731781
|
"_rankingScore": 0.890957772731781
|
||||||
}
|
}
|
||||||
@@ -504,7 +414,7 @@ async fn ranking_score_threshold() {
|
|||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(
|
||||||
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}),
|
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}),
|
||||||
|response, code| {
|
|response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(json_string!(response["hits"]), @"[]");
|
snapshot!(json_string!(response["hits"]), @"[]");
|
||||||
@@ -546,97 +456,71 @@ async fn filter() {
|
|||||||
index.wait_task(value.uid()).await;
|
index.wait_task(value.uid()).await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| {
|
||||||
json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}),
|
snapshot!(code, @"200 OK");
|
||||||
|response, code| {
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
snapshot!(code, @"200 OK");
|
[
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
{
|
||||||
[
|
"title": "Captain Marvel",
|
||||||
{
|
"release_year": 2019,
|
||||||
"title": "Captain Marvel",
|
"id": "299537",
|
||||||
"release_year": 2019,
|
"_vectors": {
|
||||||
"id": "299537",
|
"manual": [
|
||||||
"_vectors": {
|
0.6,
|
||||||
"manual": {
|
0.8,
|
||||||
"embeddings": [
|
-0.2
|
||||||
[
|
]
|
||||||
0.6000000238418579,
|
}
|
||||||
0.800000011920929,
|
},
|
||||||
-0.20000000298023224
|
{
|
||||||
]
|
"title": "How to Train Your Dragon: The Hidden World",
|
||||||
],
|
"release_year": 2019,
|
||||||
"regenerate": false
|
"id": "166428",
|
||||||
}
|
"_vectors": {
|
||||||
}
|
"manual": [
|
||||||
},
|
0.7,
|
||||||
{
|
0.7,
|
||||||
"title": "How to Train Your Dragon: The Hidden World",
|
-0.4
|
||||||
"release_year": 2019,
|
]
|
||||||
"id": "166428",
|
}
|
||||||
"_vectors": {
|
},
|
||||||
"manual": {
|
{
|
||||||
"embeddings": [
|
"title": "Shazam!",
|
||||||
[
|
"release_year": 2019,
|
||||||
0.699999988079071,
|
"id": "287947",
|
||||||
0.699999988079071,
|
"_vectors": {
|
||||||
-0.4000000059604645
|
"manual": [
|
||||||
]
|
0.8,
|
||||||
],
|
0.4,
|
||||||
"regenerate": false
|
-0.5
|
||||||
}
|
]
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
{
|
]
|
||||||
"title": "Shazam!",
|
"###);
|
||||||
"release_year": 2019,
|
})
|
||||||
"id": "287947",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
0.800000011920929,
|
|
||||||
0.4000000059604645,
|
|
||||||
-0.5
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| {
|
||||||
json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}),
|
snapshot!(code, @"200 OK");
|
||||||
|response, code| {
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
snapshot!(code, @"200 OK");
|
[
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
{
|
||||||
[
|
"title": "All Quiet on the Western Front",
|
||||||
{
|
"release_year": 1930,
|
||||||
"title": "All Quiet on the Western Front",
|
"id": "143",
|
||||||
"release_year": 1930,
|
"_vectors": {
|
||||||
"id": "143",
|
"manual": [
|
||||||
"_vectors": {
|
-0.5,
|
||||||
"manual": {
|
0.3,
|
||||||
"embeddings": [
|
0.85
|
||||||
[
|
]
|
||||||
-0.5,
|
}
|
||||||
0.30000001192092896,
|
}
|
||||||
0.8500000238418579
|
]
|
||||||
]
|
"###);
|
||||||
],
|
})
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -673,7 +557,7 @@ async fn limit_and_offset() {
|
|||||||
index.wait_task(value.uid()).await;
|
index.wait_task(value.uid()).await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| {
|
.similar(json!({"id": 143, "limit": 1}), |response, code| {
|
||||||
snapshot!(code, @"200 OK");
|
snapshot!(code, @"200 OK");
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
[
|
[
|
||||||
@@ -682,16 +566,11 @@ async fn limit_and_offset() {
|
|||||||
"release_year": 2019,
|
"release_year": 2019,
|
||||||
"id": "522681",
|
"id": "522681",
|
||||||
"_vectors": {
|
"_vectors": {
|
||||||
"manual": {
|
"manual": [
|
||||||
"embeddings": [
|
0.1,
|
||||||
[
|
0.6,
|
||||||
0.10000000149011612,
|
0.8
|
||||||
0.6000000238418579,
|
]
|
||||||
0.800000011920929
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -700,32 +579,24 @@ async fn limit_and_offset() {
|
|||||||
.await;
|
.await;
|
||||||
|
|
||||||
index
|
index
|
||||||
.similar(
|
.similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| {
|
||||||
json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}),
|
snapshot!(code, @"200 OK");
|
||||||
|response, code| {
|
snapshot!(json_string!(response["hits"]), @r###"
|
||||||
snapshot!(code, @"200 OK");
|
[
|
||||||
snapshot!(json_string!(response["hits"]), @r###"
|
{
|
||||||
[
|
"title": "Captain Marvel",
|
||||||
{
|
"release_year": 2019,
|
||||||
"title": "Captain Marvel",
|
"id": "299537",
|
||||||
"release_year": 2019,
|
"_vectors": {
|
||||||
"id": "299537",
|
"manual": [
|
||||||
"_vectors": {
|
0.6,
|
||||||
"manual": {
|
0.8,
|
||||||
"embeddings": [
|
-0.2
|
||||||
[
|
]
|
||||||
0.6000000238418579,
|
}
|
||||||
0.800000011920929,
|
}
|
||||||
-0.20000000298023224
|
]
|
||||||
]
|
"###);
|
||||||
],
|
})
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,588 +0,0 @@
|
|||||||
mod settings;
|
|
||||||
|
|
||||||
use meili_snap::{json_string, snapshot};
|
|
||||||
|
|
||||||
use crate::common::index::Index;
|
|
||||||
use crate::common::{GetAllDocumentsOptions, Server};
|
|
||||||
use crate::json;
|
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn add_remove_user_provided() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = server.index("doggo");
|
|
||||||
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
|
||||||
snapshot!(code, @"200 OK");
|
|
||||||
snapshot!(value, @r###"
|
|
||||||
{
|
|
||||||
"vectorStore": true,
|
|
||||||
"metrics": false,
|
|
||||||
"logsRoute": false
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index
|
|
||||||
.update_settings(json!({
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 3,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}))
|
|
||||||
.await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
server.wait_task(response.uid()).await;
|
|
||||||
|
|
||||||
let documents = json!([
|
|
||||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
|
|
||||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
|
|
||||||
]);
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
index.wait_task(value.uid()).await;
|
|
||||||
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
0.0,
|
|
||||||
0.0,
|
|
||||||
0.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"name": "echo",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
1.0,
|
|
||||||
1.0,
|
|
||||||
1.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 2
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents = json!([
|
|
||||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }},
|
|
||||||
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
|
|
||||||
]);
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
index.wait_task(value.uid()).await;
|
|
||||||
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
10.0,
|
|
||||||
10.0,
|
|
||||||
10.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"name": "echo",
|
|
||||||
"_vectors": {}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 2
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (value, code) = index.delete_document(0).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
index.wait_task(value.uid()).await;
|
|
||||||
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"name": "echo",
|
|
||||||
"_vectors": {}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 1
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn generate_default_user_provided_documents(server: &Server) -> Index {
|
|
||||||
let index = server.index("doggo");
|
|
||||||
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
|
||||||
snapshot!(code, @"200 OK");
|
|
||||||
snapshot!(value, @r###"
|
|
||||||
{
|
|
||||||
"vectorStore": true,
|
|
||||||
"metrics": false,
|
|
||||||
"logsRoute": false
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index
|
|
||||||
.update_settings(json!({
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 3,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}))
|
|
||||||
.await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
server.wait_task(response.uid()).await;
|
|
||||||
|
|
||||||
let documents = json!([
|
|
||||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
|
|
||||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
|
|
||||||
{"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }},
|
|
||||||
{"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}},
|
|
||||||
{"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}},
|
|
||||||
]);
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
index.wait_task(value.uid()).await;
|
|
||||||
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn user_provided_embeddings_error() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = generate_default_user_provided_documents(&server).await;
|
|
||||||
|
|
||||||
// First case, we forget to specify the `regenerate`
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 2,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
// Second case, we don't specify anything
|
|
||||||
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 3,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
// Third case, we specify something wrong in place of regenerate
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 4,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 5,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 6,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 7,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task["status"], @r###""succeeded""###);
|
|
||||||
|
|
||||||
let documents =
|
|
||||||
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task["status"], @r###""succeeded""###);
|
|
||||||
|
|
||||||
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 10,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 11,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}});
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, @r###"
|
|
||||||
{
|
|
||||||
"uid": 12,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "failed",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 0
|
|
||||||
},
|
|
||||||
"error": {
|
|
||||||
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
|
|
||||||
"code": "invalid_vectors_type",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
|
|
||||||
},
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
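Note: the failing cases above are the counterpart of the two shapes that `_vectors.<embedder>` accepts for a userProvided embedder, as used by the fixture documents elsewhere in this file. A minimal, hedged sketch of both shapes (the ids and float values below are illustrative only):

    // Illustrative sketch: the two `_vectors.manual` shapes exercised by the tests
    // above — a bare array of floats, and an object carrying a mandatory
    // `regenerate` flag plus explicit `embeddings` (one vector or several).
    use serde_json::json;

    fn main() {
        // Shorthand: the embedder receives the vector(s) directly.
        let implicit = json!({
            "id": 0,
            "name": "kefir",
            "_vectors": { "manual": [0.1, 0.2, 0.3] }
        });

        // Explicit form: `regenerate` is required, `embeddings` may hold several vectors.
        let explicit = json!({
            "id": 1,
            "name": "echo",
            "_vectors": { "manual": { "regenerate": false, "embeddings": [[1.0, 1.0, 1.0]] } }
        });

        println!("{implicit}\n{explicit}");
    }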
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn clear_documents() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = generate_default_user_provided_documents(&server).await;
|
|
||||||
|
|
||||||
let (value, _code) = index.clear_all_documents().await;
|
|
||||||
index.wait_task(value.uid()).await;
|
|
||||||
|
|
||||||
// Make sure the documents DB has been cleared
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 0
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
// Make sure the arroy DB has been cleared
|
|
||||||
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
|
|
||||||
snapshot!(documents, @r###"
|
|
||||||
{
|
|
||||||
"hits": [],
|
|
||||||
"query": "",
|
|
||||||
"processingTimeMs": "[duration]",
|
|
||||||
"limit": 20,
|
|
||||||
"offset": 0,
|
|
||||||
"estimatedTotalHits": 0,
|
|
||||||
"semanticHitCount": 0
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn add_remove_one_vector_4588() {
|
|
||||||
// https://github.com/meilisearch/meilisearch/issues/4588
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = server.index("doggo");
|
|
||||||
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
|
||||||
snapshot!(code, @"200 OK");
|
|
||||||
snapshot!(value, @r###"
|
|
||||||
{
|
|
||||||
"vectorStore": true,
|
|
||||||
"metrics": false,
|
|
||||||
"logsRoute": false
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index
|
|
||||||
.update_settings(json!({
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 3,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}))
|
|
||||||
.await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = server.wait_task(response.uid()).await;
|
|
||||||
snapshot!(task, name: "settings-processed");
|
|
||||||
|
|
||||||
let documents = json!([
|
|
||||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
|
|
||||||
]);
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, name: "document-added");
|
|
||||||
|
|
||||||
let documents = json!([
|
|
||||||
{"id": 0, "name": "kefir", "_vectors": { "manual": null }},
|
|
||||||
]);
|
|
||||||
let (value, code) = index.add_documents(documents, None).await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
let task = index.wait_task(value.uid()).await;
|
|
||||||
snapshot!(task, name: "document-deleted");
|
|
||||||
|
|
||||||
let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await;
|
|
||||||
snapshot!(documents, @r###"
|
|
||||||
{
|
|
||||||
"hits": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"query": "",
|
|
||||||
"processingTimeMs": "[duration]",
|
|
||||||
"limit": 20,
|
|
||||||
"offset": 0,
|
|
||||||
"estimatedTotalHits": 1,
|
|
||||||
"semanticHitCount": 1
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir",
|
|
||||||
"_vectors": {}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 1
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
@@ -1,228 +0,0 @@
|
|||||||
use meili_snap::{json_string, snapshot};
|
|
||||||
|
|
||||||
use crate::common::{GetAllDocumentsOptions, Server};
|
|
||||||
use crate::json;
|
|
||||||
use crate::vector::generate_default_user_provided_documents;
|
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn update_embedder() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = server.index("doggo");
|
|
||||||
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
|
||||||
snapshot!(code, @"200 OK");
|
|
||||||
snapshot!(value, @r###"
|
|
||||||
{
|
|
||||||
"vectorStore": true,
|
|
||||||
"metrics": false,
|
|
||||||
"logsRoute": false
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let (response, code) = index
|
|
||||||
.update_settings(json!({
|
|
||||||
"embedders": { "manual": {}},
|
|
||||||
}))
|
|
||||||
.await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
server.wait_task(response.uid()).await;
|
|
||||||
|
|
||||||
let (response, code) = index
|
|
||||||
.update_settings(json!({
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 2,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}))
|
|
||||||
.await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
|
|
||||||
let ret = server.wait_task(response.uid()).await;
|
|
||||||
snapshot!(ret, @r###"
|
|
||||||
{
|
|
||||||
"uid": 1,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "succeeded",
|
|
||||||
"type": "settingsUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[actix_rt::test]
|
|
||||||
async fn reset_embedder_documents() {
|
|
||||||
let server = Server::new().await;
|
|
||||||
let index = generate_default_user_provided_documents(&server).await;
|
|
||||||
|
|
||||||
let (response, code) = index.delete_settings().await;
|
|
||||||
snapshot!(code, @"202 Accepted");
|
|
||||||
server.wait_task(response.uid()).await;
|
|
||||||
|
|
||||||
// Make sure the documents are still present
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions {
|
|
||||||
limit: None,
|
|
||||||
offset: None,
|
|
||||||
retrieve_vectors: false,
|
|
||||||
fields: None,
|
|
||||||
})
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"name": "echo"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"name": "billou"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"name": "intel"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"name": "max"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 5
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
// Make sure we are still able to retrieve their vectors
|
|
||||||
let (documents, _code) = index
|
|
||||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
|
||||||
.await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"id": 0,
|
|
||||||
"name": "kefir",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
0.0,
|
|
||||||
0.0,
|
|
||||||
0.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"name": "echo",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
1.0,
|
|
||||||
1.0,
|
|
||||||
1.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"name": "billou",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
2.0,
|
|
||||||
2.0,
|
|
||||||
2.0
|
|
||||||
],
|
|
||||||
[
|
|
||||||
2.0,
|
|
||||||
2.0,
|
|
||||||
3.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"name": "intel",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
3.0,
|
|
||||||
3.0,
|
|
||||||
3.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"name": "max",
|
|
||||||
"_vectors": {
|
|
||||||
"manual": {
|
|
||||||
"embeddings": [
|
|
||||||
[
|
|
||||||
4.0,
|
|
||||||
4.0,
|
|
||||||
4.0
|
|
||||||
],
|
|
||||||
[
|
|
||||||
4.0,
|
|
||||||
4.0,
|
|
||||||
5.0
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"regenerate": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"offset": 0,
|
|
||||||
"limit": 20,
|
|
||||||
"total": 5
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
|
|
||||||
// Make sure the arroy DB has been cleared
|
|
||||||
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
|
|
||||||
snapshot!(json_string!(documents), @r###"
|
|
||||||
{
|
|
||||||
"message": "Cannot find embedder with name `default`.",
|
|
||||||
"code": "invalid_embedder",
|
|
||||||
"type": "invalid_request",
|
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_embedder"
|
|
||||||
}
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
---
|
|
||||||
source: meilisearch/tests/vector/mod.rs
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"uid": 1,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "succeeded",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 1
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
---
|
|
||||||
source: meilisearch/tests/vector/mod.rs
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"uid": 2,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "succeeded",
|
|
||||||
"type": "documentAdditionOrUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"receivedDocuments": 1,
|
|
||||||
"indexedDocuments": 1
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
---
|
|
||||||
source: meilisearch/tests/vector/mod.rs
|
|
||||||
---
|
|
||||||
{
|
|
||||||
"uid": 0,
|
|
||||||
"indexUid": "doggo",
|
|
||||||
"status": "succeeded",
|
|
||||||
"type": "settingsUpdate",
|
|
||||||
"canceledBy": null,
|
|
||||||
"details": {
|
|
||||||
"embedders": {
|
|
||||||
"manual": {
|
|
||||||
"source": "userProvided",
|
|
||||||
"dimensions": 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"error": null,
|
|
||||||
"duration": "[duration]",
|
|
||||||
"enqueuedAt": "[date]",
|
|
||||||
"startedAt": "[date]",
|
|
||||||
"finishedAt": "[date]"
|
|
||||||
}
|
|
||||||
@@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0"
roaring = { version = "0.10.2", features = ["serde"] }
roaring = "0.10.2"
rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }
@@ -71,15 +71,15 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
"onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
tiktoken-rs = "0.5.8"
liquid = "0.26.4"
arroy = "0.4.0"
arroy = "0.3.1"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] }
|
@@ -59,7 +59,6 @@ fn main() -> Result<(), Box<dyn Error>> {
false,
universe,
&None,
&None,
GeoSortStrategy::default(),
0,
20,
|
@@ -119,8 +119,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidVectorDimensions { expected: usize, found: usize },
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@@ -136,17 +134,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
}
)]
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}",
.field,
match .valid_fields.is_empty() {
true => "This index does not have configured filterable attributes.".to_string(),
false => format!("Available filterable attributes are: `{}{}`.",
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
),
}
)]
InvalidDistinctAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not facet-searchable. {}",
.field,
match .valid_fields.is_empty() {
@@ -283,9 +270,8 @@ impl From<arroy::Error> for Error {
arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. }
| arroy::Error::NeedBuild(_)
| arroy::Error::MissingKey { .. }
| arroy::Error::MissingMetadata(_) => {
| arroy::Error::MissingNode
| arroy::Error::MissingMetadata => {
Error::InternalError(InternalError::ArroyError(value))
}
}
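The two removed variants lean on thiserror's named-field interpolation to build the message quoted in the tests earlier in this diff. A small self-contained sketch of that pattern (the enum and values below are illustrative, not milli's real error type):

    // Minimal sketch of the thiserror pattern used above: struct-variant fields
    // are interpolated straight into the user-facing message.
    use thiserror::Error;

    #[derive(Debug, Error)]
    enum DocumentError {
        #[error("Bad embedder configuration in the document with id: `{document_id}`. {detail}")]
        InvalidVectorsEmbedderConf { document_id: String, detail: String },
    }

    fn main() {
        let err = DocumentError::InvalidVectorsEmbedderConf {
            document_id: "\"0\"".into(),
            detail: "Missing field `regenerate` inside `.manual`".into(),
        };
        println!("{err}");
    }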
|
@@ -4,7 +4,6 @@ use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::{FieldId, FieldsIdsMap, Weight};

#[derive(Debug, Default, Serialize, Deserialize)]
@@ -24,13 +23,7 @@ impl FieldidsWeightsMap {
/// Should only be called in the case there are NO searchable attributes.
/// All the fields will be inserted in the order of the fields ids map with a weight of 0.
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
FieldidsWeightsMap {
map: fid_map
.iter()
.filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|(fid, _name)| (fid, 0))
.collect(),
}
FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
}

/// Removes a field id from the map, returning the associated weight previously in the map.
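The filter dropped from `from_field_id_map_without_searchable` keeps `_vectors` and its nested fields out of the default weights map; `is_faceted_by` is the field-or-dotted-subfield test it relies on. A hedged, free-standing re-implementation of that test for illustration (not the crate's code; the behaviour is inferred from the `nested_fields` test further down in this diff):

    // Sketch of the parent/child field test used by the filter above: `child` is
    // "faceted by" `parent` when it is the same field or a dotted sub-field of it.
    fn is_faceted_by(child: &str, parent: &str) -> bool {
        child == parent
            || (child.starts_with(parent) && child.as_bytes().get(parent.len()) == Some(&b'.'))
    }

    fn main() {
        assert!(is_faceted_by("_vectors", "_vectors"));
        assert!(is_faceted_by("_vectors.manual", "_vectors"));
        // A mere prefix is not enough: `doggolution` is not a sub-field of `doggo`.
        assert!(!is_faceted_by("doggolution", "doggo"));
    }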
|
|||||||
@@ -41,16 +41,6 @@ impl FieldsIdsMap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the ids of a field and all its nested fields based on its name.
|
|
||||||
pub fn nested_ids(&self, name: &str) -> Vec<FieldId> {
|
|
||||||
self.names_ids
|
|
||||||
.range(name.to_string()..)
|
|
||||||
.take_while(|(key, _)| key.starts_with(name))
|
|
||||||
.filter(|(key, _)| crate::is_faceted_by(key, name))
|
|
||||||
.map(|(_name, id)| *id)
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the id of a field based on its name.
|
/// Get the id of a field based on its name.
|
||||||
pub fn id(&self, name: &str) -> Option<FieldId> {
|
pub fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
self.names_ids.get(name).copied()
|
self.names_ids.get(name).copied()
|
||||||
@@ -136,32 +126,4 @@ mod tests {
|
|||||||
assert_eq!(iter.next(), Some((3, "title")));
|
assert_eq!(iter.next(), Some((3, "title")));
|
||||||
assert_eq!(iter.next(), None);
|
assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn nested_fields() {
|
|
||||||
let mut map = FieldsIdsMap::new();
|
|
||||||
|
|
||||||
assert_eq!(map.insert("id"), Some(0));
|
|
||||||
assert_eq!(map.insert("doggo"), Some(1));
|
|
||||||
assert_eq!(map.insert("doggo.name"), Some(2));
|
|
||||||
assert_eq!(map.insert("doggolution"), Some(3));
|
|
||||||
assert_eq!(map.insert("doggo.breed.name"), Some(4));
|
|
||||||
assert_eq!(map.insert("description"), Some(5));
|
|
||||||
|
|
||||||
insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###"
|
|
||||||
[
|
|
||||||
1,
|
|
||||||
4,
|
|
||||||
2,
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
|
|
||||||
insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###"
|
|
||||||
[
|
|
||||||
4,
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
|
|
||||||
insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ use heed::types::*;
|
|||||||
use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
|
use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use rstar::RTree;
|
use rstar::RTree;
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
use crate::documents::PrimaryKey;
|
use crate::documents::PrimaryKey;
|
||||||
@@ -24,7 +23,6 @@ use crate::heed_codec::{
|
|||||||
};
|
};
|
||||||
use crate::order_by_map::OrderByMap;
|
use crate::order_by_map::OrderByMap;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
|
||||||
use crate::vector::{Embedding, EmbeddingConfig};
|
use crate::vector::{Embedding, EmbeddingConfig};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
@@ -646,7 +644,6 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
user_fields: &[&str],
|
user_fields: &[&str],
|
||||||
non_searchable_fields_ids: &[FieldId],
|
|
||||||
fields_ids_map: &FieldsIdsMap,
|
fields_ids_map: &FieldsIdsMap,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// We can write the user defined searchable fields as-is.
|
// We can write the user defined searchable fields as-is.
|
||||||
@@ -665,7 +662,6 @@ impl Index {
|
|||||||
for (weight, user_field) in user_fields.iter().enumerate() {
|
for (weight, user_field) in user_fields.iter().enumerate() {
|
||||||
if crate::is_faceted_by(field_from_map, user_field)
|
if crate::is_faceted_by(field_from_map, user_field)
|
||||||
&& !real_fields.contains(&field_from_map)
|
&& !real_fields.contains(&field_from_map)
|
||||||
&& !non_searchable_fields_ids.contains(&id)
|
|
||||||
{
|
{
|
||||||
real_fields.push(field_from_map);
|
real_fields.push(field_from_map);
|
||||||
|
|
||||||
@@ -712,7 +708,6 @@ impl Index {
|
|||||||
Ok(self
|
Ok(self
|
||||||
.fields_ids_map(rtxn)?
|
.fields_ids_map(rtxn)?
|
||||||
.names()
|
.names()
|
||||||
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
|
|
||||||
.map(|field| Cow::Owned(field.to_string()))
|
.map(|field| Cow::Owned(field.to_string()))
|
||||||
.collect())
|
.collect())
|
||||||
})
|
})
|
||||||
@@ -1573,16 +1568,12 @@ impl Index {
Ok(script_language)
}

/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs(
&self,
wtxn: &mut RwTxn<'_>,
configs: Vec<IndexEmbeddingConfig>,
configs: Vec<(String, EmbeddingConfig)>,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
wtxn,
main_key::EMBEDDING_CONFIGS,
&configs,
@@ -1593,10 +1584,13 @@ impl Index {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
}

pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> {
pub fn embedding_configs(
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
Ok(self
.main
.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default())
}
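On the old side of this hunk the configs are stored as the `IndexEmbeddingConfig` struct (defined further down in this diff) instead of bare `(String, EmbeddingConfig)` pairs. A hedged sketch of such a record and its serde serialization, assuming roaring's optional `serde` feature (the one enabled in the Cargo.toml hunk earlier) and stubbing `EmbeddingConfig`:

    // Hedged sketch of the record kept under EMBEDDING_CONFIGS on the old side:
    // one entry per embedder, carrying its config plus the ids of documents whose
    // embeddings were user provided. `EmbeddingConfig` is a stand-in here; the
    // real type lives in milli's vector module.
    use roaring::RoaringBitmap;
    use serde::{Deserialize, Serialize};

    #[derive(Debug, Default, Serialize, Deserialize)]
    struct EmbeddingConfig; // stand-in for crate::vector::EmbeddingConfig

    #[derive(Debug, Serialize, Deserialize)]
    struct IndexEmbeddingConfig {
        name: String,
        config: EmbeddingConfig,
        user_provided: RoaringBitmap,
    }

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let entry = IndexEmbeddingConfig {
            name: "manual".into(),
            config: EmbeddingConfig,
            user_provided: (0..3).collect(),
        };
        println!("{}", serde_json::to_string(&entry)?);
        Ok(())
    }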
@@ -1610,7 +1604,7 @@ impl Index {
arroy::Reader::open(rtxn, k, self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None),
arroy::Error::MissingMetadata => Ok(None),
e => Err(e.into()),
})
.transpose()
@@ -1643,7 +1637,7 @@ impl Index {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None),
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose();
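Both call sites keep the same shape on either side of the change: open the arroy reader and translate a missing-metadata error into `Ok(None)` ("no vectors indexed yet") while propagating anything else. A self-contained sketch of that shape with stand-in types (not arroy's real API):

    // Sketch of the `.map(Some).or_else(...)` shape used above: a lookup that may
    // legitimately find nothing is turned into Ok(None) instead of an error.
    // `open_reader` and `NotInitialized` are illustrative stand-ins.
    #[derive(Debug)]
    enum StoreError {
        NotInitialized,
        Other(String),
    }

    fn open_reader(exists: bool) -> Result<String, StoreError> {
        if exists { Ok("reader".to_string()) } else { Err(StoreError::NotInitialized) }
    }

    fn open_or_none(exists: bool) -> Result<Option<String>, StoreError> {
        open_reader(exists).map(Some).or_else(|e| match e {
            StoreError::NotInitialized => Ok(None),
            e => Err(e),
        })
    }

    fn main() {
        assert_eq!(open_or_none(true).unwrap(), Some("reader".to_string()));
        assert_eq!(open_or_none(false).unwrap(), None);
    }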
@@ -1668,13 +1662,6 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
|
||||||
pub struct IndexEmbeddingConfig {
|
|
||||||
pub name: String,
|
|
||||||
pub config: EmbeddingConfig,
|
|
||||||
pub user_provided: RoaringBitmap,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) mod tests {
|
pub(crate) mod tests {
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
@@ -1682,17 +1669,15 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::{EnvOpenOptions, RwTxn};
|
use heed::{EnvOpenOptions, RwTxn};
|
||||||
use maplit::{btreemap, hashset};
|
use maplit::hashset;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
use crate::documents::DocumentsBatchReader;
|
use crate::documents::DocumentsBatchReader;
|
||||||
use crate::error::{Error, InternalError};
|
use crate::error::{Error, InternalError};
|
||||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
|
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
Settings,
|
|
||||||
};
|
};
|
||||||
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
|
|
||||||
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
|
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
|
||||||
|
|
||||||
pub(crate) struct TempIndex {
|
pub(crate) struct TempIndex {
|
||||||
@@ -2798,95 +2783,4 @@ pub(crate) mod tests {
|
|||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn vectors_are_never_indexed_as_searchable_or_filterable() {
|
|
||||||
let index = TempIndex::new();
|
|
||||||
|
|
||||||
index
|
|
||||||
.add_documents(documents!([
|
|
||||||
{ "id": 0, "_vectors": { "doggo": [2345] } },
|
|
||||||
{ "id": 1, "_vectors": { "doggo": [6789] } },
|
|
||||||
]))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, fields_ids_map, @r###"
|
|
||||||
0 id |
|
|
||||||
1 _vectors |
|
|
||||||
2 _vectors.doggo |
|
|
||||||
"###);
|
|
||||||
db_snap!(index, searchable_fields, @r###"["id"]"###);
|
|
||||||
db_snap!(index, fieldids_weights_map, @r###"
|
|
||||||
fid weight
|
|
||||||
0 0 |
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let mut search = index.search(&rtxn);
|
|
||||||
let results = search.query("2345").execute().unwrap();
|
|
||||||
assert!(results.candidates.is_empty());
|
|
||||||
drop(rtxn);
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
|
|
||||||
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, fields_ids_map, @r###"
|
|
||||||
0 id |
|
|
||||||
1 _vectors |
|
|
||||||
2 _vectors.doggo |
|
|
||||||
"###);
|
|
||||||
db_snap!(index, searchable_fields, @"[]");
|
|
||||||
db_snap!(index, fieldids_weights_map, @r###"
|
|
||||||
fid weight
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let mut search = index.search(&rtxn);
|
|
||||||
let results = search.query("2345").execute().unwrap();
|
|
||||||
assert!(results.candidates.is_empty());
|
|
||||||
|
|
||||||
let mut search = index.search(&rtxn);
|
|
||||||
let results = search
|
|
||||||
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
|
|
||||||
.execute()
|
|
||||||
.unwrap();
|
|
||||||
assert!(results.candidates.is_empty());
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_embedder_settings(btreemap! {
|
|
||||||
S("doggo") => Setting::Set(EmbeddingSettings {
|
|
||||||
dimensions: Setting::Set(1),
|
|
||||||
source: Setting::Set(EmbedderSource::UserProvided),
|
|
||||||
..EmbeddingSettings::default()}),
|
|
||||||
});
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, fields_ids_map, @r###"
|
|
||||||
0 id |
|
|
||||||
1 _vectors |
|
|
||||||
2 _vectors.doggo |
|
|
||||||
"###);
|
|
||||||
db_snap!(index, searchable_fields, @"[]");
|
|
||||||
db_snap!(index, fieldids_weights_map, @r###"
|
|
||||||
fid weight
|
|
||||||
"###);
|
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
|
||||||
let mut search = index.search(&rtxn);
|
|
||||||
let results = search.query("2345").execute().unwrap();
|
|
||||||
assert!(results.candidates.is_empty());
|
|
||||||
|
|
||||||
let mut search = index.search(&rtxn);
|
|
||||||
let results = search
|
|
||||||
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
|
|
||||||
.execute()
|
|
||||||
.unwrap();
|
|
||||||
assert!(results.candidates.is_empty());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,11 +6,9 @@ use heed::Result;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{get_first_facet_value, get_highest_level};
|
use super::{get_first_facet_value, get_highest_level};
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
|
|
||||||
};
|
|
||||||
use crate::heed_codec::BytesRefCodec;
|
use crate::heed_codec::BytesRefCodec;
|
||||||
use crate::{CboRoaringBitmapCodec, DocumentId};
|
use crate::DocumentId;
|
||||||
|
|
||||||
/// Call the given closure on the facet distribution of the candidate documents.
|
/// Call the given closure on the facet distribution of the candidate documents.
|
||||||
///
|
///
|
||||||
@@ -33,9 +31,12 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
|
|||||||
where
|
where
|
||||||
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
||||||
{
|
{
|
||||||
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
|
|
||||||
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
|
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
|
||||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
let highest_level = get_highest_level(
|
||||||
|
rtxn,
|
||||||
|
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||||
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
|
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
|
||||||
@@ -74,8 +75,11 @@ where
|
|||||||
|
|
||||||
// Represents the list of keys that we must explore.
|
// Represents the list of keys that we must explore.
|
||||||
let mut heap = BinaryHeap::new();
|
let mut heap = BinaryHeap::new();
|
||||||
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
|
let highest_level = get_highest_level(
|
||||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
rtxn,
|
||||||
|
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
|
||||||
|
field_id,
|
||||||
|
)?;
|
||||||
|
|
||||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||||
// We first fill the heap with values from the highest level
|
// We first fill the heap with values from the highest level
|
||||||
@@ -88,10 +92,7 @@ where
|
|||||||
if key.field_id != field_id {
|
if key.field_id != field_id {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
|
let intersection = value.bitmap & candidates;
|
||||||
value.bitmap_bytes,
|
|
||||||
candidates,
|
|
||||||
)?;
|
|
||||||
let count = intersection.len();
|
let count = intersection.len();
|
||||||
if count != 0 {
|
if count != 0 {
|
||||||
heap.push(LevelEntry {
|
heap.push(LevelEntry {
|
||||||
@@ -120,10 +121,7 @@ where
|
|||||||
if key.field_id != field_id {
|
if key.field_id != field_id {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
|
let intersection = value.bitmap & candidates;
|
||||||
value.bitmap_bytes,
|
|
||||||
candidates,
|
|
||||||
)?;
|
|
||||||
let count = intersection.len();
|
let count = intersection.len();
|
||||||
if count != 0 {
|
if count != 0 {
|
||||||
heap.push(LevelEntry {
|
heap.push(LevelEntry {
|
||||||
@@ -148,7 +146,7 @@ where
|
|||||||
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
||||||
{
|
{
|
||||||
rtxn: &'t heed::RoTxn<'t>,
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
|
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
callback: CB,
|
callback: CB,
|
||||||
}
|
}
|
||||||
@@ -173,10 +171,7 @@ where
|
|||||||
if key.field_id != self.field_id {
|
if key.field_id != self.field_id {
|
||||||
return Ok(ControlFlow::Break(()));
|
return Ok(ControlFlow::Break(()));
|
||||||
}
|
}
|
||||||
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
|
let docids_in_common = value.bitmap & candidates;
|
||||||
value.bitmap_bytes,
|
|
||||||
candidates,
|
|
||||||
)?;
|
|
||||||
if !docids_in_common.is_empty() {
|
if !docids_in_common.is_empty() {
|
||||||
let any_docid_in_common = docids_in_common.min().unwrap();
|
let any_docid_in_common = docids_in_common.min().unwrap();
|
||||||
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
|
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
|
||||||
@@ -210,10 +205,7 @@ where
|
|||||||
if key.field_id != self.field_id {
|
if key.field_id != self.field_id {
|
||||||
return Ok(ControlFlow::Break(()));
|
return Ok(ControlFlow::Break(()));
|
||||||
}
|
}
|
||||||
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
|
let docids_in_common = value.bitmap & candidates;
|
||||||
value.bitmap_bytes,
|
|
||||||
candidates,
|
|
||||||
)?;
|
|
||||||
if !docids_in_common.is_empty() {
|
if !docids_in_common.is_empty() {
|
||||||
let cf = self.iterate(
|
let cf = self.iterate(
|
||||||
&docids_in_common,
|
&docids_in_common,
|
||||||
|
|||||||
@@ -159,7 +159,6 @@ impl<'a> Search<'a> {
offset: 0,
limit: self.limit + self.offset,
sort_criteria: self.sort_criteria.clone(),
distinct: self.distinct.clone(),
searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy,
terms_matching_strategy: self.terms_matching_strategy,
@@ -178,16 +177,16 @@ impl<'a> Search<'a> {

// completely skip semantic search if the results of the keyword search are good enough
if self.results_good_enough(&keyword_results, semantic_ratio) {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
}

// no vector search against placeholder search
let Some(query) = search.query.take() else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
};
// no embedder, no semantic search
let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
};

let vector_query = match vector {
@@ -239,44 +238,3 @@ impl<'a> Search<'a> {
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn return_keyword_results(
|
|
||||||
limit: usize,
|
|
||||||
offset: usize,
|
|
||||||
SearchResult {
|
|
||||||
matching_words,
|
|
||||||
candidates,
|
|
||||||
mut documents_ids,
|
|
||||||
mut document_scores,
|
|
||||||
degraded,
|
|
||||||
used_negative_operator,
|
|
||||||
}: SearchResult,
|
|
||||||
) -> (SearchResult, Option<u32>) {
|
|
||||||
let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
|
|
||||||
// technically redudant because documents_ids.len() == document_scores.len(),
|
|
||||||
// defensive programming
|
|
||||||
offset >= document_scores.len()
|
|
||||||
{
|
|
||||||
(vec![], vec![])
|
|
||||||
} else {
|
|
||||||
// PANICS: offset < len
|
|
||||||
documents_ids.rotate_left(offset);
|
|
||||||
documents_ids.truncate(limit);
|
|
||||||
|
|
||||||
// PANICS: offset < len
|
|
||||||
document_scores.rotate_left(offset);
|
|
||||||
document_scores.truncate(limit);
|
|
||||||
(documents_ids, document_scores)
|
|
||||||
};
|
|
||||||
(
|
|
||||||
SearchResult {
|
|
||||||
matching_words,
|
|
||||||
candidates,
|
|
||||||
documents_ids,
|
|
||||||
document_scores,
|
|
||||||
degraded,
|
|
||||||
used_negative_operator,
|
|
||||||
},
|
|
||||||
Some(0),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
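The `return_keyword_results` helper removed in the hunk above applies offset and limit in place with `rotate_left` + `truncate` after guarding against an out-of-range offset. A self-contained sketch of that pagination step, using plain ids in place of the real document ids and scores:

    // Sketch of the offset/limit step from the removed helper: guard against an
    // out-of-range offset, then rotate the kept window to the front and cut the tail.
    fn paginate(mut ids: Vec<u32>, offset: usize, limit: usize) -> Vec<u32> {
        if offset >= ids.len() {
            return Vec::new();
        }
        // Safe because offset < ids.len(): the first `offset` items move to the end...
        ids.rotate_left(offset);
        // ...and truncating to `limit` keeps at most one page.
        ids.truncate(limit);
        ids
    }

    fn main() {
        let ids = vec![10, 11, 12, 13, 14];
        assert_eq!(paginate(ids.clone(), 1, 2), vec![11, 12]);
        assert_eq!(paginate(ids, 9, 2), Vec::<u32>::new());
    }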
|
|||||||
@@ -11,8 +11,8 @@ use self::new::{execute_vector_search, PartialSearchResult};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
Result, SearchContext, TimeBudget, UserError,
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result,
SearchContext, TimeBudget,
};

// Building these factories is not free.
@@ -40,7 +40,6 @@ pub struct Search<'a> {
offset: usize,
limit: usize,
sort_criteria: Option<Vec<AscDesc>>,
distinct: Option<String>,
searchable_attributes: Option<&'a [String]>,
geo_strategy: new::GeoSortStrategy,
terms_matching_strategy: TermsMatchingStrategy,
@@ -62,7 +61,6 @@ impl<'a> Search<'a> {
offset: 0,
limit: 20,
sort_criteria: None,
distinct: None,
searchable_attributes: None,
geo_strategy: new::GeoSortStrategy::default(),
terms_matching_strategy: TermsMatchingStrategy::default(),
@@ -107,11 +105,6 @@ impl<'a> Search<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> {
|
|
||||||
self.distinct = Some(distinct);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
|
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
|
||||||
self.searchable_attributes = Some(searchable);
|
self.searchable_attributes = Some(searchable);
|
||||||
self
|
self
|
||||||
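
Editor's note: the `distinct` setter shown on one side above follows the same chainable `&mut self -> &mut Self` pattern as the other `Search` setters. A toy, self-contained sketch of that builder style (the struct and field names here are stand-ins, not the crate's `Search` type):

#[derive(Default, Debug)]
struct SearchBuilder {
    offset: usize,
    limit: usize,
    distinct: Option<String>,
}

impl SearchBuilder {
    fn offset(&mut self, offset: usize) -> &mut Self { self.offset = offset; self }
    fn limit(&mut self, limit: usize) -> &mut Self { self.limit = limit; self }
    // search-time distinct attribute, overriding the index-level setting
    fn distinct(&mut self, distinct: String) -> &mut Self { self.distinct = Some(distinct); self }
}

fn main() {
    let mut s = SearchBuilder::default();
    s.offset(0).limit(20).distinct("letter".to_string());
    println!("{s:?}");
}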
@@ -176,19 +169,6 @@ impl<'a> Search<'a> {
|
|||||||
ctx.attributes_to_search_on(searchable_attributes)?;
|
ctx.attributes_to_search_on(searchable_attributes)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(distinct) = &self.distinct {
|
|
||||||
let filterable_fields = ctx.index.filterable_fields(ctx.txn)?;
|
|
||||||
if !crate::is_faceted(distinct, &filterable_fields) {
|
|
||||||
let (valid_fields, hidden_fields) =
|
|
||||||
ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?;
|
|
||||||
return Err(Error::UserError(UserError::InvalidDistinctAttribute {
|
|
||||||
field: distinct.clone(),
|
|
||||||
valid_fields,
|
|
||||||
hidden_fields,
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
|
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
|
||||||
let PartialSearchResult {
|
let PartialSearchResult {
|
||||||
located_query_terms,
|
located_query_terms,
|
||||||
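
Editor's note: the block above validates the search-time distinct attribute against the index's filterable fields before executing the query, and otherwise errors out. A minimal sketch of that validation with plain std collections; the set lookup and the error shape are stand-ins for the crate's `is_faceted` check and `UserError::InvalidDistinctAttribute`.

use std::collections::BTreeSet;

#[derive(Debug)]
struct InvalidDistinctAttribute {
    field: String,
    valid_fields: BTreeSet<String>,
}

/// Reject a distinct field that is not declared filterable/faceted.
fn check_distinct(
    distinct: Option<&str>,
    filterable_fields: &BTreeSet<String>,
) -> Result<(), InvalidDistinctAttribute> {
    if let Some(distinct) = distinct {
        if !filterable_fields.contains(distinct) {
            return Err(InvalidDistinctAttribute {
                field: distinct.to_string(),
                valid_fields: filterable_fields.clone(),
            });
        }
    }
    Ok(())
}

fn main() {
    let filterable: BTreeSet<String> = ["letter".to_string()].into_iter().collect();
    assert!(check_distinct(Some("letter"), &filterable).is_ok());
    assert!(check_distinct(Some("unknown"), &filterable).is_err());
}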
@@ -205,7 +185,6 @@ impl<'a> Search<'a> {
|
|||||||
self.scoring_strategy,
|
self.scoring_strategy,
|
||||||
universe,
|
universe,
|
||||||
&self.sort_criteria,
|
&self.sort_criteria,
|
||||||
&self.distinct,
|
|
||||||
self.geo_strategy,
|
self.geo_strategy,
|
||||||
self.offset,
|
self.offset,
|
||||||
self.limit,
|
self.limit,
|
||||||
@@ -223,7 +202,6 @@ impl<'a> Search<'a> {
|
|||||||
self.exhaustive_number_hits,
|
self.exhaustive_number_hits,
|
||||||
universe,
|
universe,
|
||||||
&self.sort_criteria,
|
&self.sort_criteria,
|
||||||
&self.distinct,
|
|
||||||
self.geo_strategy,
|
self.geo_strategy,
|
||||||
self.offset,
|
self.offset,
|
||||||
self.limit,
|
self.limit,
|
||||||
@@ -260,7 +238,6 @@ impl fmt::Debug for Search<'_> {
|
|||||||
offset,
|
offset,
|
||||||
limit,
|
limit,
|
||||||
sort_criteria,
|
sort_criteria,
|
||||||
distinct,
|
|
||||||
searchable_attributes,
|
searchable_attributes,
|
||||||
geo_strategy: _,
|
geo_strategy: _,
|
||||||
terms_matching_strategy,
|
terms_matching_strategy,
|
||||||
@@ -280,7 +257,6 @@ impl fmt::Debug for Search<'_> {
|
|||||||
.field("offset", offset)
|
.field("offset", offset)
|
||||||
.field("limit", limit)
|
.field("limit", limit)
|
||||||
.field("sort_criteria", sort_criteria)
|
.field("sort_criteria", sort_criteria)
|
||||||
.field("distinct", distinct)
|
|
||||||
.field("searchable_attributes", searchable_attributes)
|
.field("searchable_attributes", searchable_attributes)
|
||||||
.field("terms_matching_strategy", terms_matching_strategy)
|
.field("terms_matching_strategy", terms_matching_strategy)
|
||||||
.field("scoring_strategy", scoring_strategy)
|
.field("scoring_strategy", scoring_strategy)
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||||||
ctx: &mut SearchContext<'ctx>,
|
ctx: &mut SearchContext<'ctx>,
|
||||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||||
query: &Q,
|
query: &Q,
|
||||||
distinct: Option<&str>,
|
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
from: usize,
|
from: usize,
|
||||||
length: usize,
|
length: usize,
|
||||||
@@ -35,12 +34,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||||||
logger.ranking_rules(&ranking_rules);
|
logger.ranking_rules(&ranking_rules);
|
||||||
logger.initial_universe(universe);
|
logger.initial_universe(universe);
|
||||||
|
|
||||||
let distinct_field = match distinct {
|
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
|
||||||
Some(distinct) => Some(distinct),
|
|
||||||
None => ctx.index.distinct_field(ctx.txn)?,
|
|
||||||
};
|
|
||||||
|
|
||||||
let distinct_fid = if let Some(field) = distinct_field {
|
|
||||||
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
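
Editor's note: the two versions above resolve the distinct field differently inside `bucket_sort`: one first honours the search-time override and only then falls back to the index setting, the other reads the index setting directly; in both cases the field name is then mapped to its field id. A compact sketch of the override-then-fallback resolution, with a plain `HashMap` standing in for the fields-ids map:

use std::collections::HashMap;

/// Resolve the distinct field id: search-time override first, then the index-level default.
fn resolve_distinct_fid(
    search_time_distinct: Option<&str>,
    index_distinct_field: Option<&str>,
    fields_ids_map: &HashMap<String, u16>,
) -> Option<u16> {
    let distinct_field = search_time_distinct.or(index_distinct_field)?;
    fields_ids_map.get(distinct_field).copied()
}

fn main() {
    let fields: HashMap<String, u16> = [("letter".to_string(), 3)].into_iter().collect();
    // the search-time value wins over the index setting
    assert_eq!(resolve_distinct_fid(Some("letter"), Some("other"), &fields), Some(3));
    // falls back to the index setting when no override is given
    assert_eq!(resolve_distinct_fid(None, Some("letter"), &fields), Some(3));
}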
|
|||||||
@@ -22,7 +22,7 @@ pub enum SearchEvents {
|
|||||||
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||||
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
||||||
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
||||||
RankingRuleEndIteration { ranking_rule_idx: usize },
|
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||||
ExtendResults { new: Vec<u32> },
|
ExtendResults { new: Vec<u32> },
|
||||||
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
||||||
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
||||||
@@ -123,9 +123,12 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
|
|||||||
&mut self,
|
&mut self,
|
||||||
ranking_rule_idx: usize,
|
ranking_rule_idx: usize,
|
||||||
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||||
_universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) {
|
) {
|
||||||
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
|
self.events.push(SearchEvents::RankingRuleEndIteration {
|
||||||
|
ranking_rule_idx,
|
||||||
|
universe_len: universe.len(),
|
||||||
|
});
|
||||||
self.location.pop();
|
self.location.pop();
|
||||||
}
|
}
|
||||||
fn add_to_results(&mut self, docids: &[u32]) {
|
fn add_to_results(&mut self, docids: &[u32]) {
|
||||||
@@ -323,7 +326,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
|
|||||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||||
self.write_skip_bucket(bucket_len)?;
|
self.write_skip_bucket(bucket_len)?;
|
||||||
}
|
}
|
||||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
|
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
|
||||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||||
self.write_end_iteration()?;
|
self.write_end_iteration()?;
|
||||||
}
|
}
|
||||||
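
Editor's note: one side of the hunks above extends `RankingRuleEndIteration` so the end-of-iteration event also records the size of the universe it finished with. A tiny self-contained sketch of recording such an event; the enum and logger are local stand-ins, and the `u64` count stands in for `RoaringBitmap::len()`:

#[derive(Debug)]
enum SearchEvent {
    RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
    RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
}

#[derive(Default)]
struct Logger {
    events: Vec<SearchEvent>,
}

impl Logger {
    fn end_iteration(&mut self, ranking_rule_idx: usize, universe_len: u64) {
        self.events.push(SearchEvent::RankingRuleEndIteration { ranking_rule_idx, universe_len });
    }
}

fn main() {
    let mut logger = Logger::default();
    logger.events.push(SearchEvent::RankingRuleStartIteration { ranking_rule_idx: 0, universe_len: 100 });
    logger.end_iteration(0, 42);
    println!("{:?}", logger.events);
}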
|
|||||||
@@ -516,7 +516,6 @@ mod tests {
|
|||||||
false,
|
false,
|
||||||
universe,
|
universe,
|
||||||
&None,
|
&None,
|
||||||
&None,
|
|
||||||
crate::search::new::GeoSortStrategy::default(),
|
crate::search::new::GeoSortStrategy::default(),
|
||||||
0,
|
0,
|
||||||
100,
|
100,
|
||||||
|
|||||||
@@ -568,7 +568,6 @@ pub fn execute_vector_search(
|
|||||||
scoring_strategy: ScoringStrategy,
|
scoring_strategy: ScoringStrategy,
|
||||||
universe: RoaringBitmap,
|
universe: RoaringBitmap,
|
||||||
sort_criteria: &Option<Vec<AscDesc>>,
|
sort_criteria: &Option<Vec<AscDesc>>,
|
||||||
distinct: &Option<String>,
|
|
||||||
geo_strategy: geo_sort::Strategy,
|
geo_strategy: geo_sort::Strategy,
|
||||||
from: usize,
|
from: usize,
|
||||||
length: usize,
|
length: usize,
|
||||||
@@ -599,7 +598,6 @@ pub fn execute_vector_search(
|
|||||||
ctx,
|
ctx,
|
||||||
ranking_rules,
|
ranking_rules,
|
||||||
&PlaceholderQuery,
|
&PlaceholderQuery,
|
||||||
distinct.as_deref(),
|
|
||||||
&universe,
|
&universe,
|
||||||
from,
|
from,
|
||||||
length,
|
length,
|
||||||
@@ -629,7 +627,6 @@ pub fn execute_search(
|
|||||||
exhaustive_number_hits: bool,
|
exhaustive_number_hits: bool,
|
||||||
mut universe: RoaringBitmap,
|
mut universe: RoaringBitmap,
|
||||||
sort_criteria: &Option<Vec<AscDesc>>,
|
sort_criteria: &Option<Vec<AscDesc>>,
|
||||||
distinct: &Option<String>,
|
|
||||||
geo_strategy: geo_sort::Strategy,
|
geo_strategy: geo_sort::Strategy,
|
||||||
from: usize,
|
from: usize,
|
||||||
length: usize,
|
length: usize,
|
||||||
@@ -720,7 +717,6 @@ pub fn execute_search(
|
|||||||
ctx,
|
ctx,
|
||||||
ranking_rules,
|
ranking_rules,
|
||||||
&graph,
|
&graph,
|
||||||
distinct.as_deref(),
|
|
||||||
&universe,
|
&universe,
|
||||||
from,
|
from,
|
||||||
length,
|
length,
|
||||||
@@ -736,7 +732,6 @@ pub fn execute_search(
|
|||||||
ctx,
|
ctx,
|
||||||
ranking_rules,
|
ranking_rules,
|
||||||
&PlaceholderQuery,
|
&PlaceholderQuery,
|
||||||
distinct.as_deref(),
|
|
||||||
&universe,
|
&universe,
|
||||||
from,
|
from,
|
||||||
length,
|
length,
|
||||||
@@ -753,12 +748,7 @@ pub fn execute_search(
|
|||||||
// The candidates are the universe unless the exhaustive number of hits
|
// The candidates are the universe unless the exhaustive number of hits
|
||||||
// is requested and a distinct attribute is set.
|
// is requested and a distinct attribute is set.
|
||||||
if exhaustive_number_hits {
|
if exhaustive_number_hits {
|
||||||
let distinct_field = match distinct.as_deref() {
|
if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
|
||||||
Some(distinct) => Some(distinct),
|
|
||||||
None => ctx.index.distinct_field(ctx.txn)?,
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(f) = distinct_field {
|
|
||||||
if let Some(distinct_fid) = fields_ids_map.id(f) {
|
if let Some(distinct_fid) = fields_ids_map.id(f) {
|
||||||
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
|
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
|
||||||
}
|
}
|
||||||
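
Editor's note: the hunk above only recomputes the candidate set when an exhaustive hit count is requested and a distinct field is configured, keeping the documents that survive the distinct rule. A rough sketch of that idea, where `apply_distinct_rule` is approximated by keeping one document per distinct value; the real implementation works on facet databases and `RoaringBitmap`s.

use std::collections::{BTreeMap, HashSet};

/// Keep only the first document for each distinct value; everything else is excluded
/// from the exhaustive candidate count (a rough stand-in for `apply_distinct_rule`).
fn remaining_after_distinct(
    candidates: &[u32],
    distinct_value_by_docid: &BTreeMap<u32, String>,
) -> Vec<u32> {
    let mut seen = HashSet::new();
    candidates
        .iter()
        .copied()
        .filter(|docid| match distinct_value_by_docid.get(docid) {
            Some(value) => seen.insert(value.clone()),
            None => true, // documents without the field are always kept
        })
        .collect()
}

fn main() {
    let values: BTreeMap<u32, String> =
        [(0, "A".to_string()), (1, "A".to_string()), (2, "B".to_string())].into_iter().collect();
    assert_eq!(remaining_after_distinct(&[0, 1, 2, 3], &values), vec![0, 2, 3]);
}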
|
|||||||
@@ -205,18 +205,8 @@ fn create_index() -> TempIndex {
|
|||||||
index
|
index
|
||||||
}
|
}
|
||||||
|
|
||||||
fn verify_distinct(
|
fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
|
||||||
index: &Index,
|
let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids);
|
||||||
txn: &RoTxn,
|
|
||||||
distinct: Option<&str>,
|
|
||||||
docids: &[u32],
|
|
||||||
) -> Vec<String> {
|
|
||||||
let vs = collect_field_values(
|
|
||||||
index,
|
|
||||||
txn,
|
|
||||||
distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(),
|
|
||||||
docids,
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut unique = HashSet::new();
|
let mut unique = HashSet::new();
|
||||||
for v in vs.iter() {
|
for v in vs.iter() {
|
||||||
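
Editor's note: the `verify_distinct` test helper above collects the distinct-field value of every returned document and asserts that no value appears twice. A self-contained sketch of that uniqueness check; the sentinel for documents that do not carry the field is left out of the assertion, mirroring the `__does_not_exist__` entries in the snapshots below.

use std::collections::HashSet;

/// Asserts that every distinct value appears at most once, ignoring the sentinel used
/// for documents that do not have the field at all.
fn assert_values_are_distinct(values: &[String], missing_sentinel: &str) {
    let mut unique = HashSet::new();
    for v in values {
        if v.as_str() != missing_sentinel {
            assert!(unique.insert(v), "value {v} returned twice");
        }
    }
}

fn main() {
    let values = vec![
        "A".to_string(),
        "B".to_string(),
        "__does_not_exist__".to_string(),
        "__does_not_exist__".to_string(),
    ];
    assert_values_are_distinct(&values, "__does_not_exist__");
}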
@@ -233,49 +223,12 @@ fn verify_distinct(
|
|||||||
fn test_distinct_placeholder_no_ranking_rules() {
|
fn test_distinct_placeholder_no_ranking_rules() {
|
||||||
let index = create_index();
|
let index = create_index();
|
||||||
|
|
||||||
// Set the letter as filterable and unset the distinct attribute.
|
|
||||||
index
|
|
||||||
.update_settings(|s| {
|
|
||||||
s.set_filterable_fields(hashset! { S("letter") });
|
|
||||||
s.reset_distinct_field();
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
|
||||||
|
|
||||||
let mut s = Search::new(&txn, &index);
|
|
||||||
s.distinct(S("letter"));
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
|
|
||||||
let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids);
|
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
|
||||||
[
|
|
||||||
"\"A\"",
|
|
||||||
"\"B\"",
|
|
||||||
"\"C\"",
|
|
||||||
"\"D\"",
|
|
||||||
"\"E\"",
|
|
||||||
"\"F\"",
|
|
||||||
"\"G\"",
|
|
||||||
"\"H\"",
|
|
||||||
"\"I\"",
|
|
||||||
"__does_not_exist__",
|
|
||||||
"__does_not_exist__",
|
|
||||||
"__does_not_exist__",
|
|
||||||
]
|
|
||||||
"###);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_distinct_at_search_placeholder_no_ranking_rules() {
|
|
||||||
let index = create_index();
|
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let s = Search::new(&txn, &index);
|
let s = Search::new(&txn, &index);
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"A\"",
|
"\"A\"",
|
||||||
@@ -310,7 +263,7 @@ fn test_distinct_placeholder_sort() {
|
|||||||
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"E\"",
|
"\"E\"",
|
||||||
@@ -350,7 +303,7 @@ fn test_distinct_placeholder_sort() {
|
|||||||
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"I\"",
|
"\"I\"",
|
||||||
@@ -393,7 +346,7 @@ fn test_distinct_placeholder_sort() {
|
|||||||
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"I\"",
|
"\"I\"",
|
||||||
@@ -446,7 +399,7 @@ fn test_distinct_words() {
|
|||||||
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"A\"",
|
"\"A\"",
|
||||||
@@ -500,7 +453,7 @@ fn test_distinct_sort_words() {
|
|||||||
|
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"I\"",
|
"\"I\"",
|
||||||
@@ -596,7 +549,7 @@ fn test_distinct_typo() {
|
|||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
|
||||||
|
|
||||||
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
insta::assert_debug_snapshot!(distinct_values, @r###"
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
[
|
[
|
||||||
"\"B\"",
|
"\"B\"",
|
||||||
|
|||||||
@@ -0,0 +1,244 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/search/new/tests/attribute_fid.rs
|
||||||
|
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||||
|
---
|
||||||
|
[
|
||||||
|
(
|
||||||
|
2,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 19,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 91,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
6,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 15,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 81,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
5,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 14,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 79,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
4,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 13,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 77,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
3,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 12,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 83,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
9,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 11,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 75,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
8,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 10,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 79,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
7,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 10,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 73,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
11,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 7,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 77,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
10,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 6,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 81,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
13,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 6,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 81,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
12,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 6,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 78,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
14,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 5,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 75,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
0,
|
||||||
|
[
|
||||||
|
Fid(
|
||||||
|
Rank {
|
||||||
|
rank: 1,
|
||||||
|
max_rank: 19,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Position(
|
||||||
|
Rank {
|
||||||
|
rank: 91,
|
||||||
|
max_rank: 91,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -13,7 +13,7 @@ use std::collections::BTreeSet;
|
|||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::{Search, SearchResult, TermsMatchingStrategy};
|
use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy};
|
||||||
|
|
||||||
fn create_index() -> TempIndex {
|
fn create_index() -> TempIndex {
|
||||||
let index = TempIndex::new();
|
let index = TempIndex::new();
|
||||||
@@ -66,10 +66,9 @@ fn create_index() -> TempIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg(not(feature = "swedish-recomposition"))]
|
|
||||||
fn test_stop_words_not_indexed() {
|
fn test_stop_words_not_indexed() {
|
||||||
let index = create_index();
|
let index = create_index();
|
||||||
crate::db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
|
db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/index.rs
|
||||||
|
---
|
||||||
|
age 1 |
|
||||||
|
id 2 |
|
||||||
|
name 2 |
|
||||||
|
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/index.rs
|
||||||
|
---
|
||||||
|
age 1 |
|
||||||
|
id 2 |
|
||||||
|
name 2 |
|
||||||
|
|
||||||
@@ -64,13 +64,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
|||||||
self.index.delete_geo_rtree(self.wtxn)?;
|
self.index.delete_geo_rtree(self.wtxn)?;
|
||||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||||
|
|
||||||
// Remove all user-provided bits from the configs
|
|
||||||
let mut configs = self.index.embedding_configs(self.wtxn)?;
|
|
||||||
for config in configs.iter_mut() {
|
|
||||||
config.user_provided.clear();
|
|
||||||
}
|
|
||||||
self.index.put_embedding_configs(self.wtxn, configs)?;
|
|
||||||
|
|
||||||
// Clear the other databases.
|
// Clear the other databases.
|
||||||
external_documents_ids.clear(self.wtxn)?;
|
external_documents_ids.clear(self.wtxn)?;
|
||||||
word_docids.clear(self.wtxn)?;
|
word_docids.clear(self.wtxn)?;
|
||||||
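
Editor's note: the loop above (present on one side of the compare) clears the `user_provided` bitmap of every embedding config when all documents are deleted, since no document can still own a user-provided embedding afterwards. A reduced sketch of that loop, assuming the `roaring` crate (the same bitmap type milli uses) and a plain config struct; the real code reads and writes the configs through LMDB via `embedding_configs` / `put_embedding_configs`.

use roaring::RoaringBitmap;

struct IndexEmbeddingConfig {
    name: String,
    // docids whose embeddings were provided by the user rather than generated
    user_provided: RoaringBitmap,
}

/// When clearing all documents, no document can still own a user-provided embedding.
fn clear_user_provided(configs: &mut [IndexEmbeddingConfig]) {
    for config in configs.iter_mut() {
        config.user_provided.clear();
    }
}

fn main() {
    let mut user_provided = RoaringBitmap::new();
    user_provided.insert(42);
    let mut configs = vec![IndexEmbeddingConfig { name: "default".into(), user_provided }];
    clear_user_provided(&mut configs);
    assert!(configs[0].user_provided.is_empty());
    println!("cleared user-provided bits for {}", configs[0].name);
}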
|
|||||||
@@ -8,19 +8,18 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use bytemuck::cast_slice;
|
use bytemuck::cast_slice;
|
||||||
use grenad::Writer;
|
use grenad::Writer;
|
||||||
|
use itertools::EitherOrBoth;
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::prompt::Prompt;
|
use crate::prompt::Prompt;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
use crate::update::index_documents::helpers::try_split_at;
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
|
||||||
use crate::vector::settings::{EmbedderAction, ReindexAction};
|
|
||||||
use crate::vector::Embedder;
|
use crate::vector::Embedder;
|
||||||
use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort};
|
use crate::{DocumentId, Result, ThreadPoolNoAbort};
|
||||||
|
|
||||||
/// The length of the elements that are always in the buffer when inserting new values.
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||||
@@ -36,8 +35,6 @@ pub struct ExtractedVectorPoints {
|
|||||||
// embedder
|
// embedder
|
||||||
pub embedder_name: String,
|
pub embedder_name: String,
|
||||||
pub embedder: Arc<Embedder>,
|
pub embedder: Arc<Embedder>,
|
||||||
pub add_to_user_provided: RoaringBitmap,
|
|
||||||
pub remove_from_user_provided: RoaringBitmap,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum VectorStateDelta {
|
enum VectorStateDelta {
|
||||||
@@ -45,7 +42,12 @@ enum VectorStateDelta {
|
|||||||
// Remove all vectors, generated or manual, from this document
|
// Remove all vectors, generated or manual, from this document
|
||||||
NowRemoved,
|
NowRemoved,
|
||||||
|
|
||||||
NowManual(Vec<Vec<f32>>),
|
// Add the manually specified vectors, passed in the other grenad
|
||||||
|
// Remove any previously generated vectors
|
||||||
|
// Note: changing the value of the manually specified vector **should not record** this delta
|
||||||
|
WasGeneratedNowManual(Vec<Vec<f32>>),
|
||||||
|
|
||||||
|
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),
|
||||||
|
|
||||||
// Add the vector computed from the specified prompt
|
// Add the vector computed from the specified prompt
|
||||||
// Remove any previous vector
|
// Remove any previous vector
|
||||||
@@ -54,12 +56,14 @@ enum VectorStateDelta {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl VectorStateDelta {
|
impl VectorStateDelta {
|
||||||
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
|
fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) {
|
||||||
match self {
|
match self {
|
||||||
VectorStateDelta::NoChange => Default::default(),
|
VectorStateDelta::NoChange => Default::default(),
|
||||||
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
||||||
// We always delete the previous vectors
|
VectorStateDelta::WasGeneratedNowManual(add) => {
|
||||||
VectorStateDelta::NowManual(add) => (true, Default::default(), add),
|
(true, Default::default(), (Default::default(), add))
|
||||||
|
}
|
||||||
|
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
|
||||||
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
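
Editor's note: the `(del, add)` version of `into_values` above flattens the delta enum into `(must_remove, prompt, (del_vectors, add_vectors))` so the caller can push both sides of the vector diff. A condensed, compilable sketch of that version; variant names mirror the diff and the vectors are plain `Vec<Vec<f32>>`.

type Vectors = Vec<Vec<f32>>;

enum VectorStateDelta {
    NoChange,
    // Remove all vectors, generated or manual, from this document
    NowRemoved,
    // Add the manually specified vectors and drop any previously generated ones
    WasGeneratedNowManual(Vectors),
    // Replace some manually specified vectors by others
    ManualDelta(Vectors, Vectors),
    // Add the vector computed from the specified prompt, removing any previous vector
    NowGenerated(String),
}

impl VectorStateDelta {
    fn into_values(self) -> (bool, String, (Vectors, Vectors)) {
        match self {
            VectorStateDelta::NoChange => Default::default(),
            VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
            // we always delete the previously generated vectors
            VectorStateDelta::WasGeneratedNowManual(add) => {
                (true, Default::default(), (Default::default(), add))
            }
            VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
            VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
        }
    }
}

fn main() {
    let (must_remove, prompt, (del, add)) =
        VectorStateDelta::ManualDelta(vec![vec![0.0]], vec![vec![1.0]]).into_values();
    println!("must_remove={must_remove} prompt={prompt:?} del={} add={}", del.len(), add.len());
}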
@@ -70,27 +74,12 @@ struct EmbedderVectorExtractor {
|
|||||||
embedder: Arc<Embedder>,
|
embedder: Arc<Embedder>,
|
||||||
prompt: Arc<Prompt>,
|
prompt: Arc<Prompt>,
|
||||||
|
|
||||||
|
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||||
|
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||||
// (docid) -> (prompt)
|
// (docid) -> (prompt)
|
||||||
prompts_writer: Writer<BufWriter<File>>,
|
prompts_writer: Writer<BufWriter<File>>,
|
||||||
// (docid) -> ()
|
// (docid) -> ()
|
||||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
|
||||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
|
||||||
// The docids of the documents that contain a user-defined embedding
|
|
||||||
add_to_user_provided: RoaringBitmap,
|
|
||||||
|
|
||||||
action: ExtractionAction,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct DocumentOperation {
|
|
||||||
// The docids of the documents that contain an auto-generated embedding
|
|
||||||
remove_from_user_provided: RoaringBitmap,
|
|
||||||
}
|
|
||||||
|
|
||||||
enum ExtractionAction {
|
|
||||||
SettingsFullReindex,
|
|
||||||
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
|
|
||||||
DocumentOperation(DocumentOperation),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||||
@@ -100,7 +89,6 @@ enum ExtractionAction {
|
|||||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||||
obkv_documents: grenad::Reader<R>,
|
obkv_documents: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
embedders_configs: &[IndexEmbeddingConfig],
|
|
||||||
settings_diff: &InnerIndexSettingsDiff,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
) -> Result<Vec<ExtractedVectorPoints>> {
|
) -> Result<Vec<ExtractedVectorPoints>> {
|
||||||
let reindex_vectors = settings_diff.reindex_vectors();
|
let reindex_vectors = settings_diff.reindex_vectors();
|
||||||
@@ -109,207 +97,153 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||||
// the vector field id may have changed
|
// the vector field id may have changed
|
||||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||||
|
// filter the old vector fid if the settings have been changed, forcing reindexing.
|
||||||
|
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
|
||||||
|
|
||||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||||
|
|
||||||
let mut extractors = Vec::new();
|
let mut extractors = Vec::new();
|
||||||
|
for (embedder_name, (embedder, prompt)) in
|
||||||
|
settings_diff.new.embedding_configs.clone().into_iter()
|
||||||
|
{
|
||||||
|
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||||
|
let manual_vectors_writer = create_writer(
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
|
// (docid) -> (prompt)
|
||||||
let old_configs = &settings_diff.old.embedding_configs;
|
let prompts_writer = create_writer(
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
if reindex_vectors {
|
// (docid) -> ()
|
||||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
let remove_vectors_writer = create_writer(
|
||||||
match action {
|
indexer.chunk_compression_type,
|
||||||
EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted
|
indexer.chunk_compression_level,
|
||||||
EmbedderAction::Reindex(action) => {
|
tempfile::tempfile()?,
|
||||||
let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name)
|
);
|
||||||
else {
|
|
||||||
tracing::error!(embedder = name, "Requested embedder config not found");
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
extractors.push(EmbedderVectorExtractor {
|
||||||
let manual_vectors_writer = create_writer(
|
embedder_name,
|
||||||
indexer.chunk_compression_type,
|
embedder,
|
||||||
indexer.chunk_compression_level,
|
prompt,
|
||||||
tempfile::tempfile()?,
|
manual_vectors_writer,
|
||||||
);
|
prompts_writer,
|
||||||
|
remove_vectors_writer,
|
||||||
// (docid) -> (prompt)
|
});
|
||||||
let prompts_writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
// (docid) -> ()
|
|
||||||
let remove_vectors_writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
let action = match action {
|
|
||||||
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
|
|
||||||
ReindexAction::RegeneratePrompts => {
|
|
||||||
let Some((_, old_prompt)) = old_configs.get(name) else {
|
|
||||||
tracing::error!(embedder = name, "Old embedder config not found");
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
extractors.push(EmbedderVectorExtractor {
|
|
||||||
embedder_name,
|
|
||||||
embedder,
|
|
||||||
prompt,
|
|
||||||
prompts_writer,
|
|
||||||
remove_vectors_writer,
|
|
||||||
manual_vectors_writer,
|
|
||||||
add_to_user_provided: RoaringBitmap::new(),
|
|
||||||
action,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// document operation
|
|
||||||
|
|
||||||
for (embedder_name, (embedder, prompt)) in configs.into_iter() {
|
|
||||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
|
||||||
let manual_vectors_writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
// (docid) -> (prompt)
|
|
||||||
let prompts_writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
// (docid) -> ()
|
|
||||||
let remove_vectors_writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
extractors.push(EmbedderVectorExtractor {
|
|
||||||
embedder_name,
|
|
||||||
embedder,
|
|
||||||
prompt,
|
|
||||||
prompts_writer,
|
|
||||||
remove_vectors_writer,
|
|
||||||
manual_vectors_writer,
|
|
||||||
add_to_user_provided: RoaringBitmap::new(),
|
|
||||||
action: ExtractionAction::DocumentOperation(DocumentOperation {
|
|
||||||
remove_from_user_provided: RoaringBitmap::new(),
|
|
||||||
}),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
// this must always be serialized as (docid, external_docid);
|
// this must always be serialized as (docid, external_docid);
|
||||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
|
||||||
let (docid_bytes, external_id_bytes) =
|
let (docid_bytes, external_id_bytes) =
|
||||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
||||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
|
||||||
|
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
|
||||||
// since we only need the primary key when we throw an error we create this getter to
|
// since we only need the primary key when we throw an error we create this getter to
|
||||||
// lazily get it when needed
|
// lazily get it when needed
|
||||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||||
|
|
||||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
|
||||||
docid,
|
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||||
embedders_configs,
|
|
||||||
obkv,
|
|
||||||
old_vectors_fid,
|
|
||||||
new_vectors_fid,
|
|
||||||
)
|
|
||||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
|
||||||
|
|
||||||
for EmbedderVectorExtractor {
|
for EmbedderVectorExtractor {
|
||||||
embedder_name,
|
embedder_name,
|
||||||
embedder: _,
|
embedder: _,
|
||||||
prompt,
|
prompt,
|
||||||
|
manual_vectors_writer,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
manual_vectors_writer,
|
|
||||||
add_to_user_provided,
|
|
||||||
action,
|
|
||||||
} in extractors.iter_mut()
|
} in extractors.iter_mut()
|
||||||
{
|
{
|
||||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
let delta = match parsed_vectors.remove(embedder_name) {
|
||||||
let delta = match action {
|
(Some(old), Some(new)) => {
|
||||||
ExtractionAction::SettingsFullReindex => match old {
|
// no autogeneration
|
||||||
// A full reindex can be triggered either by:
|
let del_vectors = old.into_array_of_vectors();
|
||||||
// 1. a new embedder
|
let add_vectors = new.into_array_of_vectors();
|
||||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
|
||||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
|
||||||
VectorState::Inline(vectors) => {
|
|
||||||
if !vectors.must_regenerate() {
|
|
||||||
add_to_user_provided.insert(docid);
|
|
||||||
}
|
|
||||||
|
|
||||||
match vectors.into_array_of_vectors() {
|
if add_vectors.len() > usize::from(u8::MAX) {
|
||||||
Some(add_vectors) => {
|
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||||
if add_vectors.len() > usize::from(u8::MAX) {
|
document_id().to_string(),
|
||||||
return Err(crate::Error::UserError(
|
add_vectors.len(),
|
||||||
crate::UserError::TooManyVectors(
|
)));
|
||||||
document_id().to_string(),
|
|
||||||
add_vectors.len(),
|
|
||||||
),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
VectorStateDelta::NowManual(add_vectors)
|
|
||||||
}
|
|
||||||
None => VectorStateDelta::NoChange,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
|
||||||
VectorState::Manual => VectorStateDelta::NoChange,
|
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
|
||||||
// generated vectors must be regenerated
|
}
|
||||||
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
|
(Some(_old), None) => {
|
||||||
},
|
// Do we keep this document?
|
||||||
// prompt regeneration is only triggered for existing embedders
|
let document_is_kept = obkv
|
||||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
.iter()
|
||||||
if old.must_regenerate() {
|
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||||
regenerate_if_prompt_changed(
|
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||||
|
if document_is_kept {
|
||||||
|
// becomes autogenerated
|
||||||
|
VectorStateDelta::NowGenerated(prompt.render(
|
||||||
obkv,
|
obkv,
|
||||||
(old_prompt, prompt),
|
DelAdd::Addition,
|
||||||
(old_fields_ids_map, new_fields_ids_map),
|
new_fields_ids_map,
|
||||||
)?
|
)?)
|
||||||
} else {
|
} else {
|
||||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
VectorStateDelta::NowRemoved
|
||||||
// already in the DB since this is an existing embedder
|
}
|
||||||
VectorStateDelta::NoChange
|
}
|
||||||
|
(None, Some(new)) => {
|
||||||
|
// was possibly autogenerated, remove all vectors for that document
|
||||||
|
let add_vectors = new.into_array_of_vectors();
|
||||||
|
if add_vectors.len() > usize::from(u8::MAX) {
|
||||||
|
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||||
|
document_id().to_string(),
|
||||||
|
add_vectors.len(),
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorStateDelta::WasGeneratedNowManual(add_vectors)
|
||||||
|
}
|
||||||
|
(None, None) => {
|
||||||
|
// Do we keep this document?
|
||||||
|
let document_is_kept = obkv
|
||||||
|
.iter()
|
||||||
|
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||||
|
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||||
|
|
||||||
|
if document_is_kept {
|
||||||
|
// Don't give up if the old prompt was failing
|
||||||
|
let old_prompt = Some(&prompt)
|
||||||
|
// TODO: this filter works because we erase the vec database when an embedding setting changes.
|
||||||
|
// When vector pipeline will be optimized, this should be removed.
|
||||||
|
.filter(|_| !settings_diff.reindex_vectors())
|
||||||
|
.map(|p| {
|
||||||
|
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||||
|
.unwrap_or_default()
|
||||||
|
});
|
||||||
|
let new_prompt =
|
||||||
|
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||||
|
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||||
|
let old_prompt = old_prompt.unwrap_or_default();
|
||||||
|
tracing::trace!(
|
||||||
|
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||||
|
);
|
||||||
|
VectorStateDelta::NowGenerated(new_prompt)
|
||||||
|
} else {
|
||||||
|
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||||
|
VectorStateDelta::NoChange
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
VectorStateDelta::NowRemoved
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ExtractionAction::DocumentOperation(DocumentOperation {
|
|
||||||
remove_from_user_provided,
|
|
||||||
}) => extract_vector_document_diff(
|
|
||||||
docid,
|
|
||||||
obkv,
|
|
||||||
prompt,
|
|
||||||
(add_to_user_provided, remove_from_user_provided),
|
|
||||||
(old, new),
|
|
||||||
(old_fields_ids_map, new_fields_ids_map),
|
|
||||||
document_id,
|
|
||||||
)?,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// and we finally push the unique vectors into the writer
|
// and we finally push the unique vectors into the writer
|
||||||
push_vectors_diff(
|
push_vectors_diff(
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
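
Editor's note: both versions of the extraction loop above treat each grenad key as the big-endian `DocumentId` followed by the UTF-8 external document id, and recover the two halves by splitting at `size_of::<DocumentId>()` (via `try_split_array_at` or `try_split_at` in the diff). A small std-only sketch of that packing and splitting; `DocumentId = u32` is assumed here, as elsewhere in the crate.

type DocumentId = u32;
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();

/// Pack `(docid, external_docid)` the way the extraction step expects to read it back.
fn pack_key(docid: DocumentId, external_id: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(SIZE_OF_DOCUMENTID + external_id.len());
    key.extend_from_slice(&docid.to_be_bytes());
    key.extend_from_slice(external_id.as_bytes());
    key
}

/// Split a key back into its internal docid and external id.
fn split_key(key: &[u8]) -> Option<(DocumentId, &str)> {
    if key.len() < SIZE_OF_DOCUMENTID {
        return None;
    }
    let (docid_bytes, external_bytes) = key.split_at(SIZE_OF_DOCUMENTID);
    let docid = DocumentId::from_be_bytes(docid_bytes.try_into().ok()?);
    let external = std::str::from_utf8(external_bytes).ok()?;
    Some((docid, external))
}

fn main() {
    let key = pack_key(7, "doc-7");
    assert_eq!(split_key(&key), Some((7, "doc-7")));
}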
@@ -317,6 +251,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
manual_vectors_writer,
|
manual_vectors_writer,
|
||||||
&mut key_buffer,
|
&mut key_buffer,
|
||||||
delta,
|
delta,
|
||||||
|
reindex_vectors,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -327,185 +262,43 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
embedder_name,
|
embedder_name,
|
||||||
embedder,
|
embedder,
|
||||||
prompt: _,
|
prompt: _,
|
||||||
|
manual_vectors_writer,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
action,
|
|
||||||
manual_vectors_writer,
|
|
||||||
add_to_user_provided,
|
|
||||||
} in extractors
|
} in extractors
|
||||||
{
|
{
|
||||||
let remove_from_user_provided =
|
|
||||||
if let ExtractionAction::DocumentOperation(DocumentOperation {
|
|
||||||
remove_from_user_provided,
|
|
||||||
}) = action
|
|
||||||
{
|
|
||||||
remove_from_user_provided
|
|
||||||
} else {
|
|
||||||
Default::default()
|
|
||||||
};
|
|
||||||
|
|
||||||
results.push(ExtractedVectorPoints {
|
results.push(ExtractedVectorPoints {
|
||||||
|
// docid, _index -> KvWriterDelAdd -> Vector
|
||||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||||
|
// docid -> ()
|
||||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||||
|
// docid -> prompt
|
||||||
prompts: writer_into_reader(prompts_writer)?,
|
prompts: writer_into_reader(prompts_writer)?,
|
||||||
|
|
||||||
embedder,
|
embedder,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
add_to_user_provided,
|
|
||||||
remove_from_user_provided,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_vector_document_diff(
|
/// Computes the diff between both Del and Add numbers and
|
||||||
docid: DocumentId,
|
/// only inserts the parts that differ in the sorter.
|
||||||
obkv: obkv::KvReader<'_, FieldId>,
|
|
||||||
prompt: &Prompt,
|
|
||||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
|
||||||
(old, new): (VectorState, VectorState),
|
|
||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
|
||||||
document_id: impl Fn() -> Value,
|
|
||||||
) -> Result<VectorStateDelta> {
|
|
||||||
match (old.must_regenerate(), new.must_regenerate()) {
|
|
||||||
(true, true) | (false, false) => {}
|
|
||||||
(true, false) => {
|
|
||||||
add_to_user_provided.insert(docid);
|
|
||||||
}
|
|
||||||
(false, true) => {
|
|
||||||
remove_from_user_provided.insert(docid);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let delta = match (old, new) {
|
|
||||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
|
||||||
// be extracted manually
|
|
||||||
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
|
||||||
Some(add_vectors) => {
|
|
||||||
if add_vectors.len() > usize::from(u8::MAX) {
|
|
||||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
|
||||||
document_id().to_string(),
|
|
||||||
add_vectors.len(),
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
|
|
||||||
VectorStateDelta::NowManual(add_vectors)
|
|
||||||
}
|
|
||||||
None => VectorStateDelta::NoChange,
|
|
||||||
},
|
|
||||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
|
||||||
// document changed
|
|
||||||
(VectorState::Generated, VectorState::Generated) => {
|
|
||||||
// Do we keep this document?
|
|
||||||
let document_is_kept = obkv
|
|
||||||
.iter()
|
|
||||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
|
||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
|
||||||
|
|
||||||
if document_is_kept {
|
|
||||||
// Don't give up if the old prompt was failing
|
|
||||||
let old_prompt = Some(&prompt).map(|p| {
|
|
||||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
|
||||||
});
|
|
||||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
|
||||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
|
||||||
let old_prompt = old_prompt.unwrap_or_default();
|
|
||||||
tracing::trace!(
|
|
||||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
|
||||||
);
|
|
||||||
VectorStateDelta::NowGenerated(new_prompt)
|
|
||||||
} else {
|
|
||||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
|
||||||
VectorStateDelta::NoChange
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
VectorStateDelta::NowRemoved
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
|
|
||||||
// the previous version of the document.
|
|
||||||
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
|
|
||||||
// Generated -> Generated is handled above, so not possible
|
|
||||||
// As a result, this code is unreachable
|
|
||||||
(_not_generated, VectorState::Generated) => {
|
|
||||||
// Do we keep this document?
|
|
||||||
let document_is_kept = obkv
|
|
||||||
.iter()
|
|
||||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
|
||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
|
||||||
if document_is_kept {
|
|
||||||
// becomes autogenerated
|
|
||||||
VectorStateDelta::NowGenerated(prompt.render(
|
|
||||||
obkv,
|
|
||||||
DelAdd::Addition,
|
|
||||||
new_fields_ids_map,
|
|
||||||
)?)
|
|
||||||
} else {
|
|
||||||
// make sure the document is always removed from user provided on removal
|
|
||||||
remove_from_user_provided.insert(docid);
|
|
||||||
VectorStateDelta::NowRemoved
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
|
|
||||||
// version of the document.
|
|
||||||
// however the Rust type system cannot know that.
|
|
||||||
(_manual, VectorState::Manual) => {
|
|
||||||
// Do we keep this document?
|
|
||||||
let document_is_kept = obkv
|
|
||||||
.iter()
|
|
||||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
|
||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
|
||||||
if document_is_kept {
|
|
||||||
// if the new version of the document has the vectors in the DB,
|
|
||||||
// then they are user-provided and nothing possibly changed
|
|
||||||
VectorStateDelta::NoChange
|
|
||||||
} else {
|
|
||||||
// make sure the document is always removed from user provided on removal
|
|
||||||
remove_from_user_provided.insert(docid);
|
|
||||||
VectorStateDelta::NowRemoved
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(delta)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn regenerate_if_prompt_changed(
|
|
||||||
obkv: obkv::KvReader<'_, FieldId>,
|
|
||||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
|
||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
|
||||||
) -> Result<VectorStateDelta> {
|
|
||||||
let old_prompt =
|
|
||||||
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
|
|
||||||
let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
|
||||||
|
|
||||||
if new_prompt == old_prompt {
|
|
||||||
return Ok(VectorStateDelta::NoChange);
|
|
||||||
}
|
|
||||||
Ok(VectorStateDelta::NowGenerated(new_prompt))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn regenerate_prompt(
|
|
||||||
obkv: obkv::KvReader<'_, FieldId>,
|
|
||||||
prompt: &Prompt,
|
|
||||||
new_fields_ids_map: &FieldsIdsMap,
|
|
||||||
) -> Result<VectorStateDelta> {
|
|
||||||
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
|
||||||
|
|
||||||
Ok(VectorStateDelta::NowGenerated(prompt))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// We cannot compute the diff between both Del and Add vectors.
|
|
||||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
|
||||||
fn push_vectors_diff(
|
fn push_vectors_diff(
|
||||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||||
key_buffer: &mut Vec<u8>,
|
key_buffer: &mut Vec<u8>,
|
||||||
delta: VectorStateDelta,
|
delta: VectorStateDelta,
|
||||||
|
reindex_vectors: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||||
if must_remove {
|
if must_remove
|
||||||
|
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||||
|
// When vector pipeline will be optimized, this should be removed.
|
||||||
|
&& !reindex_vectors
|
||||||
|
{
|
||||||
key_buffer.truncate(TRUNCATE_SIZE);
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||||
}
|
}
|
||||||
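
Editor's note: both code paths above reject a document that declares more than `u8::MAX` inline vectors for one embedder, returning a `TooManyVectors` user error. A tiny sketch of that guard; the local `TooManyVectors` struct stands in for `crate::UserError::TooManyVectors`.

#[derive(Debug)]
struct TooManyVectors {
    document_id: String,
    count: usize,
}

/// Refuse documents that declare more inline vectors than the per-document limit.
fn check_vector_count(document_id: &str, add_vectors: &[Vec<f32>]) -> Result<(), TooManyVectors> {
    if add_vectors.len() > usize::from(u8::MAX) {
        return Err(TooManyVectors {
            document_id: document_id.to_string(),
            count: add_vectors.len(),
        });
    }
    Ok(())
}

fn main() {
    let ok = vec![vec![0.0_f32; 3]; 2];
    assert!(check_vector_count("doc-1", &ok).is_ok());
    let too_many = vec![vec![0.0_f32; 3]; 300];
    assert!(check_vector_count("doc-2", &too_many).is_err());
}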
@@ -515,22 +308,44 @@ fn push_vectors_diff(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We sort and dedup the vectors
|
// We sort and dedup the vectors
|
||||||
|
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||||
|
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||||
|
|
||||||
|
let merged_vectors_iter =
|
||||||
|
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||||
|
|
||||||
// insert vectors into the writer
|
// insert vectors into the writer
|
||||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
|
||||||
// Generate the key by extending the unique index to it.
|
// Generate the key by extending the unique index to it.
|
||||||
key_buffer.truncate(TRUNCATE_SIZE);
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
let index = u16::try_from(i).unwrap();
|
let index = u16::try_from(i).unwrap();
|
||||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||||
|
|
||||||
// We insert only the Add part of the Obkv to inform
|
match eob {
|
||||||
// that we only want to remove all those vectors.
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
let mut obkv = KvWriterDelAdd::memory();
|
EitherOrBoth::Left(vector) => {
|
||||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||||
let bytes = obkv.into_inner()?;
|
// When vector pipeline will be optimized, this should be removed.
|
||||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
if !reindex_vectors {
|
||||||
|
// We insert only the Del part of the Obkv to inform
|
||||||
|
// that we only want to remove all those vectors.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(vector) => {
|
||||||
|
// We insert only the Add part of the Obkv to inform
|
||||||
|
// that we only want to remove all those vectors.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
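The hunk above diffs the sorted, deduplicated deletion and addition lists in lockstep with itertools' merge_join_by, acting only on entries present on a single side. A minimal standalone sketch of that pattern, with plain integers standing in for the embedding vectors:

// Sketch only: integers stand in for vectors so the comparison stays total.
use itertools::{merge_join_by, EitherOrBoth};

fn main() {
    let del = vec![1, 3, 5]; // vectors present before the update
    let add = vec![3, 4, 5]; // vectors present after the update

    for eob in merge_join_by(del, add, |d, a| d.cmp(a)) {
        match eob {
            EitherOrBoth::Both(_, _) => {}                     // unchanged, nothing to write
            EitherOrBoth::Left(v) => println!("delete {v}"),   // only on the old side
            EitherOrBoth::Right(v) => println!("insert {v}"),  // only on the new side
        }
    }
}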
@@ -8,10 +8,11 @@ mod extract_vector_points;
 mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
+// mod searchable;

 use std::fs::File;
 use std::io::BufReader;
-use std::sync::{Arc, OnceLock};
+use std::sync::Arc;

 use crossbeam_channel::Sender;
 use rayon::prelude::*;
@@ -30,9 +31,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
-use crate::index::IndexEmbeddingConfig;
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
+use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};

 /// Extract data for each databases from obkv documents in parallel.
 /// Send data in grenad file over provided Sender.
@@ -44,7 +44,6 @@ pub(crate) fn data_from_obkv_documents(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
@@ -57,7 +56,6 @@ pub(crate) fn data_from_obkv_documents(
                 original_documents_chunk,
                 indexer,
                 lmdb_writer_sx.clone(),
-                embedders_configs.clone(),
                 settings_diff.clone(),
             )
         })
@@ -207,47 +205,33 @@ fn run_extraction_task<FE, FS, M>(
     })
 }

-fn request_threads() -> &'static ThreadPoolNoAbort {
-    static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
-
-    REQUEST_THREADS.get_or_init(|| {
-        ThreadPoolNoAbortBuilder::new()
-            .num_threads(crate::vector::REQUEST_PARALLELISM)
-            .thread_name(|index| format!("embedding-request-{index}"))
-            .build()
-            .unwrap()
-    })
-}
-
 /// Extract chunked data and send it into lmdb_writer_sx sender:
 /// - documents
 fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

+    let request_threads = ThreadPoolNoAbortBuilder::new()
+        .num_threads(crate::vector::REQUEST_PARALLELISM)
+        .thread_name(|index| format!("embedding-request-{index}"))
+        .build()?;
+
     let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
         // no point in indexing vectors without embedders
         && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());

     if index_vectors {
         let settings_diff = settings_diff.clone();
-        let embedders_configs = embedders_configs.clone();

         let original_documents_chunk = original_documents_chunk.clone();
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            match extract_vector_points(
-                original_documents_chunk.clone(),
-                indexer,
-                &embedders_configs,
-                &settings_diff,
-            ) {
+            match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
                 Ok(extracted_vectors) => {
                     for ExtractedVectorPoints {
                         manual_vectors,
@@ -255,15 +239,13 @@ fn send_original_documents_data(
                         prompts,
                         embedder_name,
                         embedder,
-                        add_to_user_provided,
-                        remove_from_user_provided,
                     } in extracted_vectors
                     {
                         let embeddings = match extract_embeddings(
                             prompts,
                             indexer,
                             embedder.clone(),
-                            request_threads(),
+                            &request_threads,
                         ) {
                             Ok(results) => Some(results),
                             Err(error) => {
@@ -281,8 +263,6 @@ fn send_original_documents_data(
                             expected_dimension: embedder.dimensions(),
                             manual_vectors,
                             embedder_name,
-                            add_to_user_provided,
-                            remove_from_user_provided,
                         }));
                     }
                 }
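The removed `request_threads` helper relies on a lazily initialized, process-wide pool shared by every call, whereas the replacement builds one pool per call to `send_original_documents_data`. A hedged illustration of the OnceLock pattern, using rayon's public ThreadPoolBuilder as a stand-in for the crate's internal ThreadPoolNoAbortBuilder and a hypothetical thread count in place of REQUEST_PARALLELISM:

use std::sync::OnceLock;

fn request_threads() -> &'static rayon::ThreadPool {
    // Built at most once, then reused by every caller for the lifetime of the process.
    static POOL: OnceLock<rayon::ThreadPool> = OnceLock::new();
    POOL.get_or_init(|| {
        rayon::ThreadPoolBuilder::new()
            .num_threads(4) // hypothetical parallelism
            .thread_name(|index| format!("embedding-request-{index}"))
            .build()
            .unwrap()
    })
}

fn main() {
    request_threads().install(|| println!("running on the shared embedding-request pool"));
}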
@@ -0,0 +1,211 @@
+use std::collections::HashMap;
+
+use charabia::normalizer::NormalizedTokenIter;
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use roaring::RoaringBitmap;
+use serde_json::Value;
+
+use crate::update::settings::InnerIndexSettings;
+use crate::{InternalError, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
+
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
+
+pub struct FieldWordPositionExtractorBuilder<'a> {
+    max_positions_per_attributes: u16,
+    stop_words: Option<&'a fst::Set<Vec<u8>>>,
+    separators: Option<Vec<&'a str>>,
+    dictionary: Option<Vec<&'a str>>,
+}
+
+impl<'a> FieldWordPositionExtractorBuilder<'a> {
+    pub fn new(
+        max_positions_per_attributes: Option<u32>,
+        settings: &'a InnerIndexSettings,
+    ) -> Result<Self> {
+        let stop_words = settings.stop_words.as_ref();
+        let separators: Option<Vec<_>> =
+            settings.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary: Option<Vec<_>> =
+            settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        Ok(Self {
+            max_positions_per_attributes: max_positions_per_attributes
+                .map_or(MAX_POSITION_PER_ATTRIBUTE as u16, |max| {
+                    max.min(MAX_POSITION_PER_ATTRIBUTE) as u16
+                }),
+            stop_words,
+            separators,
+            dictionary,
+        })
+    }
+
+    pub fn build(&'a self) -> FieldWordPositionExtractor<'a> {
+        let builder = tokenizer_builder(
+            self.stop_words,
+            self.separators.as_deref(),
+            self.dictionary.as_deref(),
+            None,
+        );
+
+        FieldWordPositionExtractor {
+            tokenizer: builder.into_tokenizer(),
+            max_positions_per_attributes: self.max_positions_per_attributes,
+        }
+    }
+}
+
+pub struct FieldWordPositionExtractor<'a> {
+    tokenizer: Tokenizer<'a>,
+    max_positions_per_attributes: u16,
+}
+
+impl<'a> FieldWordPositionExtractor<'a> {
+    pub fn extract<'b>(
+        &'a self,
+        field_bytes: &[u8],
+        buffer: &'b mut String,
+    ) -> Result<ExtractedFieldWordPosition<'a, 'b>> {
+        let field_value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+        Ok(ExtractedFieldWordPosition {
+            tokenizer: &self.tokenizer,
+            max_positions_per_attributes: self.max_positions_per_attributes,
+            field_value,
+            buffer: buffer,
+        })
+    }
+}
+
+pub struct ExtractedFieldWordPosition<'a, 'b> {
+    tokenizer: &'a Tokenizer<'a>,
+    max_positions_per_attributes: u16,
+    field_value: Value,
+    buffer: &'b mut String,
+}
+
+impl<'a> ExtractedFieldWordPosition<'a, '_> {
+    pub fn iter<'o>(&'o mut self) -> FieldWordPositionIter<'o> {
+        self.buffer.clear();
+        let inner = match json_to_string(&self.field_value, &mut self.buffer) {
+            Some(field) => Some(self.tokenizer.tokenize(field)),
+            None => None,
+        };
+
+        // create an iterator of token with their positions.
+        FieldWordPositionIter {
+            inner,
+            max_positions_per_attributes: self.max_positions_per_attributes,
+            position: 0,
+            prev_kind: None,
+        }
+    }
+}
+
+pub struct FieldWordPositionIter<'a> {
+    inner: Option<NormalizedTokenIter<'a, 'a>>,
+    max_positions_per_attributes: u16,
+    position: u16,
+    prev_kind: Option<TokenKind>,
+}
+
+impl<'a> Iterator for FieldWordPositionIter<'a> {
+    type Item = (u16, Token<'a>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.position >= self.max_positions_per_attributes {
+            return None;
+        }
+
+        let token = self.inner.as_mut().map(|i| i.next()).flatten()?;
+
+        match token.kind {
+            TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
+                self.position += match self.prev_kind {
+                    Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                    Some(_) => 1,
+                    None => 0,
+                };
+                self.prev_kind = Some(token.kind)
+            }
+            TokenKind::Separator(_) if self.position == 0 => {
+                return self.next();
+            }
+            TokenKind::Separator(SeparatorKind::Hard) => {
+                self.prev_kind = Some(token.kind);
+            }
+            TokenKind::Separator(SeparatorKind::Soft)
+                if self.prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
+            {
+                self.prev_kind = Some(token.kind);
+            }
+            _ => return self.next(),
+        }
+
+        if !token.is_word() {
+            return self.next();
+        }
+
+        // keep a word only if it is not empty and fit in a LMDB key.
+        let lemma = token.lemma().trim();
+        if !lemma.is_empty() && lemma.len() <= MAX_WORD_LENGTH {
+            Some((self.position, token))
+        } else {
+            self.next()
+        }
+    }
+}
+
+/// Factorize tokenizer building.
+pub fn tokenizer_builder<'a>(
+    stop_words: Option<&'a fst::Set<Vec<u8>>>,
+    allowed_separators: Option<&'a [&str]>,
+    dictionary: Option<&'a [&str]>,
+    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
+) -> TokenizerBuilder<'a, Vec<u8>> {
+    let mut tokenizer_builder = TokenizerBuilder::new();
+    if let Some(stop_words) = stop_words {
+        tokenizer_builder.stop_words(stop_words);
+    }
+    if let Some(dictionary) = dictionary {
+        tokenizer_builder.words_dict(dictionary);
+    }
+    if let Some(separators) = allowed_separators {
+        tokenizer_builder.separators(separators);
+    }
+
+    if let Some(script_language) = script_language {
+        tokenizer_builder.allow_list(script_language);
+    }
+
+    tokenizer_builder
+}
+
+/// Transform a JSON value into a string that can be indexed.
+fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
+    fn inner(value: &Value, output: &mut String) -> bool {
+        use std::fmt::Write;
+        match value {
+            Value::Null | Value::Object(_) => false,
+            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
+            Value::Number(number) => write!(output, "{}", number).is_ok(),
+            Value::String(string) => write!(output, "{}", string).is_ok(),
+            Value::Array(array) => {
+                let mut count = 0;
+                for value in array {
+                    if inner(value, output) {
+                        output.push_str(". ");
+                        count += 1;
+                    }
+                }
+                // check that at least one value was written
+                count != 0
+            }
+        }
+    }
+
+    if let Value::String(string) = value {
+        Some(string)
+    } else if inner(value, buffer) {
+        Some(buffer)
+    } else {
+        None
+    }
+}
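For a concrete feel of the flattening rule `json_to_string` implements for flat arrays (values joined with ". ", nulls and objects skipped), here is a small standalone sketch that mimics the observable behaviour; it is not the function from the new file above:

use serde_json::{json, Value};

fn flatten(value: &Value) -> Option<String> {
    match value {
        Value::Null | Value::Object(_) => None,
        Value::String(s) => Some(s.clone()),
        Value::Bool(b) => Some(b.to_string()),
        Value::Number(n) => Some(n.to_string()),
        Value::Array(a) => {
            // Join the printable elements, keeping the trailing ". " the original adds per value.
            let parts: Vec<String> = a.iter().filter_map(flatten).collect();
            (!parts.is_empty()).then(|| parts.join(". ") + ". ")
        }
    }
}

fn main() {
    let field = json!(["release the kraken", 3, null]);
    assert_eq!(flatten(&field).as_deref(), Some("release the kraken. 3. "));
}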
114  milli/src/update/index_documents/extract/searchable/mod.rs  Normal file
@@ -0,0 +1,114 @@
+use std::collections::{BTreeMap, BTreeSet};
+use std::convert::TryInto;
+use std::fs::File;
+use std::io;
+use std::io::BufReader;
+
+use field_word_position::FieldWordPositionExtractorBuilder;
+use obkv::KvReader;
+use roaring::RoaringBitmap;
+use word_docids::{WordDocidsDump, WordDocidsExtractor};
+
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
+use crate::update::index_documents::extract::extract_docid_word_positions::ScriptLanguageDocidsMap;
+use crate::update::index_documents::GrenadParameters;
+use crate::update::settings::InnerIndexSettingsDiff;
+use crate::{FieldId, Result, SerializationError};
+
+mod field_word_position;
+mod word_docids;
+
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+pub fn extract_searchable_data<R: io::Read + io::Seek>(
+    obkv_documents: grenad::Reader<R>,
+    indexer: GrenadParameters,
+    settings_diff: &InnerIndexSettingsDiff,
+    max_positions_per_attributes: Option<u32>,
+) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+    let searchable_fields_to_index = settings_diff.searchable_fields_to_index();
+
+    let mut documents_ids = RoaringBitmap::new();
+
+    let add_builder =
+        FieldWordPositionExtractorBuilder::new(max_positions_per_attributes, &settings_diff.new)?;
+    let add_token_positions_extractor = add_builder.build();
+    let del_builder;
+    let del_token_positions_extractor = if settings_diff.settings_update_only {
+        del_builder = FieldWordPositionExtractorBuilder::new(
+            max_positions_per_attributes,
+            &settings_diff.old,
+        )?;
+        del_builder.build()
+    } else {
+        add_builder.build()
+    };
+    let token_positions_extractor = &[del_token_positions_extractor, add_token_positions_extractor];
+
+    let mut word_map = BTreeMap::new();
+    let mut word_docids_extractor = WordDocidsExtractor::new(settings_diff);
+
+    let mut cursor = obkv_documents.into_cursor()?;
+    // loop over documents
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let document_id = key
+            .try_into()
+            .map(u32::from_be_bytes)
+            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+        let obkv = KvReader::<FieldId>::new(value);
+        // if the searchable fields didn't change, skip the searchable indexing for this document.
+        if !settings_diff.reindex_searchable()
+            && !searchable_fields_changed(&obkv, &searchable_fields_to_index)
+        {
+            continue;
+        }
+
+        documents_ids.push(document_id);
+
+        let mut buffer = String::new();
+        for field_id in searchable_fields_to_index.iter() {
+            let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
+
+            for (deladd, field_bytes) in field_obkv {
+                let mut extracted_positions =
+                    token_positions_extractor[deladd as usize].extract(field_bytes, &mut buffer)?;
+                for (position, token) in extracted_positions.iter() {
+                    let word = token.lemma().trim();
+                    if !word_map.contains_key(word) {
+                        word_map.insert(word.to_string(), word_map.len() as u32);
+                    }
+                    let word_id = word_map.get(word).unwrap();
+                    word_docids_extractor.insert(*word_id, *field_id, document_id, deladd);
+                }
+            }
+        }
+
+        if word_docids_extractor.rough_size_estimate()
+            > indexer.max_memory.map_or(512 * 1024 * 1024, |s| s.min(512 * 1024 * 1024))
+        {
+            let WordDocidsDump { .. } =
+                word_docids_extractor.dump(&word_map, &searchable_fields_to_index, indexer)?;
+        }
+    }
+
+    todo!()
+}
+
+/// Check if any searchable fields of a document changed.
+fn searchable_fields_changed(
+    obkv: &KvReader<FieldId>,
+    searchable_fields: &BTreeSet<FieldId>,
+) -> bool {
+    for field_id in searchable_fields {
+        let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
+        match (field_obkv.get(DelAdd::Deletion), field_obkv.get(DelAdd::Addition)) {
+            // if both fields are None, check the next field.
+            (None, None) => (),
+            // if both contains a value and values are the same, check the next field.
+            (Some(del), Some(add)) if del == add => (),
+            // otherwise the fields are different, return true.
+            _otherwise => return true,
+        }
+    }
+
+    false
+}
@@ -0,0 +1,203 @@
+use std::collections::hash_map::Entry::{Occupied, Vacant};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::fs::File;
+use std::hash::Hash;
+use std::io::BufReader;
+use std::mem::size_of;
+
+use roaring::RoaringBitmap;
+
+use crate::update::del_add::KvWriterDelAdd;
+use crate::update::index_documents::extract::searchable::DelAdd;
+use crate::update::index_documents::{create_writer, writer_into_reader, GrenadParameters};
+use crate::update::settings::InnerIndexSettingsDiff;
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
+
+pub struct WordDocidsExtractor<'a> {
+    word_fid_docids: RevertedIndex<(u32, FieldId)>,
+    settings_diff: &'a InnerIndexSettingsDiff,
+}
+
+impl<'a> WordDocidsExtractor<'a> {
+    pub fn new(settings_diff: &'a InnerIndexSettingsDiff) -> Self {
+        Self { word_fid_docids: RevertedIndex::new(), settings_diff }
+    }
+    pub fn insert(&mut self, wordid: u32, fieldid: FieldId, docid: DocumentId, del_add: DelAdd) {
+        self.word_fid_docids.insert((wordid, fieldid), docid, del_add);
+    }
+
+    pub fn rough_size_estimate(&self) -> usize {
+        self.word_fid_docids.rough_size_estimate()
+    }
+
+    pub fn dump(
+        &mut self,
+        word_map: &BTreeMap<String, u32>,
+        fields: &BTreeSet<FieldId>,
+        indexer: GrenadParameters,
+    ) -> Result<WordDocidsDump> {
+        let mut word_fid_docids_writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+
+        let mut word_docids_writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+
+        let mut exact_word_docids_writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+
+        let mut exact_word_deletion = RoaringBitmap::new();
+        let mut exact_word_addition = RoaringBitmap::new();
+        let mut word_deletion = RoaringBitmap::new();
+        let mut word_addition = RoaringBitmap::new();
+        let mut key_buffer = Vec::new();
+        let mut bitmap_buffer = Vec::new();
+        let mut obkv_buffer = Vec::new();
+        for (word, wid) in word_map {
+            exact_word_deletion.clear();
+            exact_word_addition.clear();
+            word_deletion.clear();
+            word_addition.clear();
+            for fid in fields {
+                if let Some((deletion, addition)) = self.word_fid_docids.inner.get(&(*wid, *fid)) {
+                    if self.settings_diff.old.exact_attributes.contains(&fid) {
+                        exact_word_deletion |= deletion;
+                    } else {
+                        word_deletion |= deletion;
+                    }
+
+                    if self.settings_diff.new.exact_attributes.contains(&fid) {
+                        exact_word_addition |= addition;
+                    } else {
+                        word_addition |= addition;
+                    }
+
+                    if deletion != addition {
+                        key_buffer.clear();
+                        key_buffer.extend_from_slice(word.as_bytes());
+                        key_buffer.push(0);
+                        key_buffer.extend_from_slice(&fid.to_be_bytes());
+                        let value = bitmaps_into_deladd_obkv(
+                            deletion,
+                            addition,
+                            &mut obkv_buffer,
+                            &mut bitmap_buffer,
+                        )?;
+                        word_fid_docids_writer.insert(&key_buffer, value)?;
+                    }
+                }
+            }
+
+            key_buffer.clear();
+            key_buffer.extend_from_slice(word.as_bytes());
+            if exact_word_deletion != exact_word_addition {
+                let value = bitmaps_into_deladd_obkv(
+                    &exact_word_deletion,
+                    &exact_word_addition,
+                    &mut obkv_buffer,
+                    &mut bitmap_buffer,
+                )?;
+                exact_word_docids_writer.insert(&key_buffer, value)?;
+            }
+
+            if word_deletion != word_addition {
+                let value = bitmaps_into_deladd_obkv(
+                    &word_deletion,
+                    &word_addition,
+                    &mut obkv_buffer,
+                    &mut bitmap_buffer,
+                )?;
+                word_docids_writer.insert(&key_buffer, value)?;
+            }
+        }
+
+        self.word_fid_docids.clear();
+
+        Ok(WordDocidsDump {
+            word_fid_docids: writer_into_reader(word_fid_docids_writer)?,
+            word_docids: writer_into_reader(word_docids_writer)?,
+            exact_word_docids: writer_into_reader(exact_word_docids_writer)?,
+        })
+    }
+}
+
+fn bitmaps_into_deladd_obkv<'a>(
+    deletion: &RoaringBitmap,
+    addition: &RoaringBitmap,
+    obkv_buffer: &'a mut Vec<u8>,
+    bitmap_buffer: &mut Vec<u8>,
+) -> Result<&'a mut Vec<u8>> {
+    obkv_buffer.clear();
+    let mut value_writer = KvWriterDelAdd::new(obkv_buffer);
+    if !deletion.is_empty() {
+        bitmap_buffer.clear();
+        CboRoaringBitmapCodec::serialize_into(deletion, bitmap_buffer);
+        value_writer.insert(DelAdd::Deletion, &*bitmap_buffer)?;
+    }
+    if !addition.is_empty() {
+        bitmap_buffer.clear();
+        CboRoaringBitmapCodec::serialize_into(addition, bitmap_buffer);
+        value_writer.insert(DelAdd::Addition, &*bitmap_buffer)?;
+    }
+    Ok(value_writer.into_inner()?)
+}
+
+#[derive(Debug)]
+struct RevertedIndex<K> {
+    inner: HashMap<K, (RoaringBitmap, RoaringBitmap)>,
+    max_value_size: usize,
+}
+
+impl<K: PartialEq + Eq + Hash> RevertedIndex<K> {
+    pub fn insert(&mut self, key: K, docid: DocumentId, del_add: DelAdd) {
+        let size = match self.inner.entry(key) {
+            Occupied(mut entry) => {
+                let (ref mut del, ref mut add) = entry.get_mut();
+                match del_add {
+                    DelAdd::Deletion => del.insert(docid),
+                    DelAdd::Addition => add.insert(docid),
+                };
+                del.serialized_size() + add.serialized_size()
+            }
+            Vacant(entry) => {
+                let mut bitmap = RoaringBitmap::new();
+                bitmap.insert(docid);
+                let size = bitmap.serialized_size();
+                match del_add {
+                    DelAdd::Deletion => entry.insert((bitmap, RoaringBitmap::new())),
+                    DelAdd::Addition => entry.insert((RoaringBitmap::new(), bitmap)),
+                };
+                size * 2
+            }
+        };
+
+        self.max_value_size = self.max_value_size.max(size);
+    }
+
+    pub fn new() -> Self {
+        Self { inner: HashMap::new(), max_value_size: 0 }
+    }
+
+    pub fn rough_size_estimate(&self) -> usize {
+        self.inner.len() * size_of::<K>() + self.inner.len() * self.max_value_size
+    }
+
+    fn clear(&mut self) {
+        self.max_value_size = 0;
+        self.inner.clear();
+    }
+}
+
+pub struct WordDocidsDump {
+    pub word_fid_docids: grenad::Reader<BufReader<File>>,
+    pub word_docids: grenad::Reader<BufReader<File>>,
+    pub exact_word_docids: grenad::Reader<BufReader<File>>,
+}
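Conceptually, `RevertedIndex` keeps one (deletion, addition) RoaringBitmap pair per key and tracks serialized sizes so the caller knows when to flush. A hedged, standalone sketch of that shape using only the public roaring crate (the real struct also caches a rough size estimate):

use roaring::RoaringBitmap;
use std::collections::HashMap;

fn main() {
    // key = (word id, field id), value = (docids removed, docids added)
    let mut index: HashMap<(u32, u16), (RoaringBitmap, RoaringBitmap)> = HashMap::new();

    let (del, add) = index.entry((0, 1)).or_default();
    add.insert(42); // document 42 now contains this word in this field
    del.insert(7);  // document 7 no longer does

    // Serialized bitmap sizes are what a rough memory estimate would sum over.
    assert!(del.serialized_size() + add.serialized_size() > 0);
}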
@@ -286,7 +286,6 @@ where
         settings_diff.new.recompute_searchables(self.wtxn, self.index)?;

         let settings_diff = Arc::new(settings_diff);
-        let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);

         let backup_pool;
         let pool = match self.indexer_config.thread_pool {
@@ -400,7 +399,6 @@ where
                     pool_params,
                     lmdb_writer_sx.clone(),
                     primary_key_id,
-                    embedders_configs.clone(),
                     settings_diff_cloned,
                     max_positions_per_attributes,
                 )
@@ -503,8 +501,6 @@ where
                     embeddings,
                     manual_vectors,
                     embedder_name,
-                    add_to_user_provided,
-                    remove_from_user_provided,
                 } => {
                     dimension.insert(embedder_name.clone(), expected_dimension);
                     TypedChunk::VectorPoints {
@@ -513,8 +509,6 @@ where
                         expected_dimension,
                         manual_vectors,
                         embedder_name,
-                        add_to_user_provided,
-                        remove_from_user_provided,
                     }
                 }
                 otherwise => otherwise,
@@ -547,11 +541,10 @@ where
             pool.install(|| {
                 for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
                     let writer = arroy::Writer::new(vector_arroy, k, dimension);
-                    if writer.need_build(wtxn)? {
-                        writer.build(wtxn, &mut rng, None)?;
-                    } else if writer.is_empty(wtxn)? {
+                    if writer.is_empty(wtxn)? {
                         break;
                     }
+                    writer.build(wtxn, &mut rng, None)?;
                 }
                 Result::Ok(())
             })
@@ -788,7 +781,6 @@ mod tests {
     use super::*;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
-    use crate::index::IndexEmbeddingConfig;
    use crate::search::TermsMatchingStrategy;
    use crate::update::Setting;
    use crate::{db_snap, Filter, Search};
@@ -2624,12 +2616,10 @@ mod tests {

        let rtxn = index.read_txn().unwrap();
        let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
-        let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
-            embedding_configs.pop().unwrap();
-        insta::assert_snapshot!(embedder_name, @"manual");
-        insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
+        let (embedder_name, embedder) = embedding_configs.pop().unwrap();
        let embedder =
            std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
+        assert_eq!("manual", embedder_name);
        let res = index
            .search(&rtxn)
            .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))

@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::collections::btree_map::Entry as BEntry;
 use std::collections::hash_map::Entry as HEntry;
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::io::{Read, Seek};

@@ -27,8 +27,6 @@ use crate::update::del_add::{
 use crate::update::index_documents::GrenadParameters;
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
-use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
-use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
 use crate::{
     is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
 };
@@ -53,6 +51,7 @@ pub struct Transform<'a, 'i> {
     fields_ids_map: FieldsIdsMap,

     indexer_settings: &'a IndexerConfig,
+    pub autogenerate_docids: bool,
     pub index_documents_method: IndexDocumentsMethod,
     available_documents_ids: AvailableDocumentsIds,

@@ -106,7 +105,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         index: &'i Index,
         indexer_settings: &'a IndexerConfig,
         index_documents_method: IndexDocumentsMethod,
-        _autogenerate_docids: bool,
+        autogenerate_docids: bool,
     ) -> Result<Self> {
         // We must choose the appropriate merge function for when two or more documents
         // with the same user id must be merged or fully replaced in the same batch.
@@ -140,6 +139,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             index,
             fields_ids_map: index.fields_ids_map(wtxn)?,
             indexer_settings,
+            autogenerate_docids,
             available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
             original_sorter,
             flattened_sorter,
@@ -808,13 +808,13 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut new_inner_settings = old_inner_settings.clone();
         new_inner_settings.fields_ids_map = fields_ids_map;

-        let embedding_config_updates = Default::default();
+        let embedding_configs_updated = false;
         let settings_update_only = false;
         let settings_diff = InnerIndexSettingsDiff::new(
             old_inner_settings,
             new_inner_settings,
             primary_key_id,
-            embedding_config_updates,
+            embedding_configs_updated,
             settings_update_only,
         );

@@ -835,13 +835,10 @@ impl<'a, 'i> Transform<'a, 'i> {
     /// Rebind the field_ids of the provided document to their values
     /// based on the field_ids_maps difference between the old and the new settings,
     /// then fill the provided buffers with delta documents using KvWritterDelAdd.
-    #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
     fn rebind_existing_document(
         old_obkv: KvReader<FieldId>,
         settings_diff: &InnerIndexSettingsDiff,
         modified_faceted_fields: &HashSet<String>,
-        mut injected_vectors: serde_json::Map<String, serde_json::Value>,
-        old_vectors_fid: Option<FieldId>,
         original_obkv_buffer: Option<&mut Vec<u8>>,
         flattened_obkv_buffer: Option<&mut Vec<u8>>,
     ) -> Result<()> {
@@ -864,49 +861,9 @@ impl<'a, 'i> Transform<'a, 'i> {

         // The operations that we must perform on the different fields.
         let mut operations = HashMap::new();
-        let mut error_seen = false;

         let mut obkv_writer = KvWriter::<_, FieldId>::memory();
-        'write_fid: for (id, val) in old_obkv.iter() {
+        for (id, val) in old_obkv.iter() {
-            if !injected_vectors.is_empty() {
-                'inject_vectors: {
-                    let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
-
-                    if id < vectors_fid {
-                        break 'inject_vectors;
-                    }
-
-                    let mut existing_vectors = if id == vectors_fid {
-                        let existing_vectors: std::result::Result<
-                            serde_json::Map<String, serde_json::Value>,
-                            serde_json::Error,
-                        > = serde_json::from_slice(val);
-
-                        match existing_vectors {
-                            Ok(existing_vectors) => existing_vectors,
-                            Err(error) => {
-                                if !error_seen {
-                                    tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map");
-                                    error_seen = true;
-                                }
-                                Default::default()
-                            }
-                        }
-                    } else {
-                        Default::default()
-                    };
-
-                    existing_vectors.append(&mut injected_vectors);
-
-                    operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
-                    obkv_writer
-                        .insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?;
-                    if id == vectors_fid {
-                        continue 'write_fid;
-                    }
-                }
-            }
-
             if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
                 operations.insert(id, DelAddOperation::DeletionAndAddition);
                 obkv_writer.insert(id, val)?;
@@ -915,15 +872,6 @@ impl<'a, 'i> Transform<'a, 'i> {
                 obkv_writer.insert(id, val)?;
             }
         }
-        if !injected_vectors.is_empty() {
-            'inject_vectors: {
-                let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
-
-                operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
-                obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?;
-            }
-        }

         let data = obkv_writer.into_inner()?;
         let obkv = KvReader::<FieldId>::new(&data);

@@ -989,35 +937,6 @@ impl<'a, 'i> Transform<'a, 'i> {
             None
         };

-        let readers: Result<
-            BTreeMap<&str, (Vec<arroy::Reader<arroy::distances::Angular>>, &RoaringBitmap)>,
-        > = settings_diff
-            .embedding_config_updates
-            .iter()
-            .filter_map(|(name, action)| {
-                if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
-                    embedder_id,
-                    user_provided,
-                }) = action
-                {
-                    let readers: Result<Vec<_>> =
-                        self.index.arroy_readers(wtxn, *embedder_id).collect();
-                    match readers {
-                        Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
-                        Err(error) => Some(Err(error)),
-                    }
-                } else {
-                    None
-                }
-            })
-            .collect();
-        let readers = readers?;
-
-        let old_vectors_fid = settings_diff
-            .old
-            .fields_ids_map
-            .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
-
         // We initialize the sorter with the user indexing settings.
         let mut flattened_sorter =
             if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
@@ -1044,50 +963,10 @@ impl<'a, 'i> Transform<'a, 'i> {
                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
             )?;

-            let injected_vectors: std::result::Result<
-                serde_json::Map<String, serde_json::Value>,
-                arroy::Error,
-            > = readers
-                .iter()
-                .filter_map(|(name, (readers, user_provided))| {
-                    if !user_provided.contains(docid) {
-                        return None;
-                    }
-                    let mut vectors = Vec::new();
-                    for reader in readers {
-                        let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
-                            break;
-                        };
-
-                        match vector {
-                            Ok(vector) => vectors.push(vector),
-                            Err(error) => return Some(Err(error)),
-                        }
-                    }
-                    if vectors.is_empty() {
-                        return None;
-                    }
-                    Some(Ok((
-                        name.to_string(),
-                        serde_json::to_value(ExplicitVectors {
-                            embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
-                                vectors,
-                            )),
-                            regenerate: false,
-                        })
-                        .unwrap(),
-                    )))
-                })
-                .collect();
-
-            let injected_vectors = injected_vectors?;
-
             Self::rebind_existing_document(
                 old_obkv,
                 &settings_diff,
                 &modified_faceted_fields,
-                injected_vectors,
-                old_vectors_fid,
                 Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
                 Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
             )?;
@@ -1104,23 +983,6 @@ impl<'a, 'i> Transform<'a, 'i> {
             }
         }

-        let mut writers = Vec::new();
-
-        // delete all vectors from the embedders that need removal
-        for (_, (readers, _)) in readers {
-            for reader in readers {
-                let dimensions = reader.dimensions();
-                let arroy_index = reader.index();
-                drop(reader);
-                let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions);
-                writers.push(writer);
-            }
-        }
-
-        for writer in writers {
-            writer.clear(wtxn)?;
-        }
-
         let grenad_params = GrenadParameters {
             chunk_compression_type: self.indexer_settings.chunk_compression_type,
             chunk_compression_level: self.indexer_settings.chunk_compression_level,
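The code removed from `rebind_existing_document` leans on labeled blocks: `break 'label` leaves the block early, and `continue 'label` targets the labeled outer loop, all without extra flags or helper functions. A self-contained illustration of that control-flow pattern, unrelated to the crate's types:

fn main() {
    let maybe_fid: Option<u32> = None;

    'inject_vectors: {
        // let-else bails out of the labeled block when there is nothing to inject.
        let Some(fid) = maybe_fid else { break 'inject_vectors };
        println!("would inject vectors into field {fid}");
    }

    println!("control continues here either way");
}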
@@ -20,7 +20,6 @@ use super::MergeFn;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
-use crate::index::IndexEmbeddingConfig;
 use crate::proximity::MAX_DISTANCE;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
@@ -91,8 +90,6 @@ pub(crate) enum TypedChunk {
         expected_dimension: usize,
         manual_vectors: grenad::Reader<BufReader<File>>,
         embedder_name: String,
-        add_to_user_provided: RoaringBitmap,
-        remove_from_user_provided: RoaringBitmap,
     },
     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }
@@ -157,11 +154,8 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut docids = index.documents_ids(wtxn)?;
             let mut iter = merger.into_stream_merger_iter()?;

-            let embedders: BTreeSet<_> = index
-                .embedding_configs(wtxn)?
-                .into_iter()
-                .map(|IndexEmbeddingConfig { name, .. }| name)
-                .collect();
+            let embedders: BTreeSet<_> =
+                index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
             let mut vectors_buffer = Vec::new();
             while let Some((key, reader)) = iter.next()? {
                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@@ -187,7 +181,7 @@ pub(crate) fn write_typed_chunk_into_index(
                         // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
                         break 'vectors Some(addition);
                     };
-                    vectors.retain_not_embedded_vectors(&embedders);
+                    vectors.retain_user_provided_vectors(&embedders);
                     let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
                     if vectors.is_empty() {
                         // skip writing empty `_vectors` map
@@ -625,8 +619,6 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
             let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
             let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
-            let mut add_to_user_provided = RoaringBitmap::new();
-            let mut remove_from_user_provided = RoaringBitmap::new();
             let mut params = None;
             for typed_chunk in typed_chunks {
                 let TypedChunk::VectorPoints {
@@ -635,8 +627,6 @@ pub(crate) fn write_typed_chunk_into_index(
                     embeddings,
                     expected_dimension,
                     embedder_name,
-                    add_to_user_provided: aud,
-                    remove_from_user_provided: rud,
                 } = typed_chunk
                 else {
                     unreachable!();
@@ -649,23 +639,11 @@ pub(crate) fn write_typed_chunk_into_index(
                 if let Some(embeddings) = embeddings {
                     embeddings_builder.push(embeddings.into_cursor()?);
                 }
-                add_to_user_provided |= aud;
-                remove_from_user_provided |= rud;
             }

             // typed chunks has always at least 1 chunk.
             let Some((expected_dimension, embedder_name)) = params else { unreachable!() };

-            let mut embedding_configs = index.embedding_configs(wtxn)?;
-            let index_embedder_config = embedding_configs
-                .iter_mut()
-                .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
-                .unwrap();
-            index_embedder_config.user_provided -= remove_from_user_provided;
-            index_embedder_config.user_provided |= add_to_user_provided;
-
-            index.put_embedding_configs(wtxn, embedding_configs)?;
-
             let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
                 InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
             )?;
@@ -6,7 +6,6 @@ use std::sync::Arc;
 use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::{EitherOrBoth, Itertools};
-use roaring::RoaringBitmap;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;

@@ -15,18 +14,12 @@ use super::index_documents::{IndexDocumentsConfig, Transform};
 use super::IndexerConfig;
 use crate::criterion::Criterion;
 use crate::error::UserError;
-use crate::index::{
-    IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
-};
+use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
 use crate::order_by_map::OrderByMap;
 use crate::proximity::ProximityPrecision;
 use crate::update::index_documents::IndexDocumentsMethod;
 use crate::update::{IndexDocuments, UpdateIndexingStep};
-use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
-use crate::vector::settings::{
-    check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
-    WriteBackToDocuments,
-};
+use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
 use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
 use crate::{FieldId, FieldsIdsMap, Index, Result};

@@ -497,7 +490,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             self.index.put_all_searchable_fields_from_fields_ids_map(
                 self.wtxn,
                 &names,
-                &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
                 &fields_ids_map,
             )?;
             self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@@ -927,177 +919,92 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(changed)
     }

-    fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
-        match std::mem::take(&mut self.embedder_settings) {
-            Setting::Set(configs) => self.update_embedding_configs_set(configs),
-            Setting::Reset => {
-                // all vectors should be written back to documents
+    fn update_embedding_configs(&mut self) -> Result<bool> {
+        let update = match std::mem::take(&mut self.embedder_settings) {
+            Setting::Set(configs) => {
+                let mut changed = false;
                 let old_configs = self.index.embedding_configs(self.wtxn)?;
-                let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
+                let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
+                    old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
+
+                let mut new_configs = BTreeMap::new();
+                for joined in old_configs
                     .into_iter()
-                    .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> {
-                        let embedder_id =
-                            self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
-                                crate::InternalError::DatabaseMissingEntry {
-                                    db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
-                                    key: None,
-                                },
-                            )?;
-                        Ok((
-                            name,
-                            EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
-                                embedder_id,
-                                user_provided,
-                            }),
-                        ))
+                    .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
+                {
+                    match joined {
+                        // updated config
+                        EitherOrBoth::Both((name, mut old), (_, new)) => {
+                            changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
+                            if changed {
+                                tracing::debug!(embedder = name, "need reindex");
+                            } else {
+                                tracing::debug!(embedder = name, "skip reindex");
+                            }
+                            let new = validate_embedding_settings(old, &name)?;
+                            new_configs.insert(name, new);
+                        }
+                        // unchanged config
+                        EitherOrBoth::Left((name, setting)) => {
+                            new_configs.insert(name, setting);
+                        }
+                        // new config
+                        EitherOrBoth::Right((name, mut setting)) => {
+                            // apply the default source in case the source was not set so that it gets validated
+                            crate::vector::settings::EmbeddingSettings::apply_default_source(
+                                &mut setting,
+                            );
+                            crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
+                                &mut setting,
+                            );
+                            let setting = validate_embedding_settings(setting, &name)?;
+                            changed = true;
+                            new_configs.insert(name, setting);
+                        }
+                    }
+                }
+                let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
+                    .into_iter()
+                    .filter_map(|(name, setting)| match setting {
+                        Setting::Set(value) => Some((name, value.into())),
+                        Setting::Reset => None,
+                        Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
                     })
                     .collect();

-                let remove_all = remove_all?;
-
                 self.index.embedder_category_id.clear(self.wtxn)?;
+                for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
+                    self.index.embedder_category_id.put_with_flags(
+                        self.wtxn,
+                        heed::PutFlags::APPEND,
+                        embedder_name,
+                        &index
+                            .try_into()
+                            .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
+                    )?;
+                }
+
+                if new_configs.is_empty() {
+                    self.index.delete_embedding_configs(self.wtxn)?;
+                } else {
+                    self.index.put_embedding_configs(self.wtxn, new_configs)?;
+                }
+                changed
+            }
+            Setting::Reset => {
                 self.index.delete_embedding_configs(self.wtxn)?;
-                Ok(remove_all)
+                true
             }
-            Setting::NotSet => Ok(Default::default()),
-        }
-    }
+            Setting::NotSet => false,
+        };

-    fn update_embedding_configs_set(
-        &mut self,
-        configs: BTreeMap<String, Setting<EmbeddingSettings>>,
-    ) -> Result<BTreeMap<String, EmbedderAction>> {
-        use crate::vector::settings::SettingsDiff;
+        // if any changes force a reindexing
+        // clear the vector database.
+        if update {
+            self.index.vector_arroy.clear(self.wtxn)?;
+        }

-        let old_configs = self.index.embedding_configs(self.wtxn)?;
+        Ok(update)
-        let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
-            .into_iter()
-            .map(|IndexEmbeddingConfig { name, config, user_provided }| {
-                (name, (config.into(), user_provided))
-            })
-            .collect();
-        let mut updated_configs = BTreeMap::new();
-        let mut embedder_actions = BTreeMap::new();
-        for joined in old_configs
-            .into_iter()
-            .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
-        {
-            match joined {
-                // updated config
-                EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
-                    let settings_diff = SettingsDiff::from_settings(old, new);
-                    match settings_diff {
-                        SettingsDiff::Remove => {
-                            tracing::debug!(
-                                embedder = name,
-                                user_provided = user_provided.len(),
-                                "removing embedder"
-                            );
-                            let embedder_id =
-                                self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
-                                    crate::InternalError::DatabaseMissingEntry {
-                                        db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
-                                        key: None,
-                                    },
-                                )?;
-                            // free id immediately
-                            self.index.embedder_category_id.delete(self.wtxn, &name)?;
-                            embedder_actions.insert(
-                                name,
-                                EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
-                                    embedder_id,
user_provided,
|
|
||||||
}),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
SettingsDiff::Reindex { action, updated_settings } => {
|
|
||||||
tracing::debug!(
|
|
||||||
embedder = name,
|
|
||||||
user_provided = user_provided.len(),
|
|
||||||
?action,
|
|
||||||
"reindex embedder"
|
|
||||||
);
|
|
||||||
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
|
|
||||||
let new =
|
|
||||||
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
|
|
||||||
updated_configs.insert(name, (new, user_provided));
|
|
||||||
}
|
|
||||||
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
|
|
||||||
tracing::debug!(
|
|
||||||
embedder = name,
|
|
||||||
user_provided = user_provided.len(),
|
|
||||||
"update without reindex embedder"
|
|
||||||
);
|
|
||||||
let new =
|
|
||||||
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
|
|
||||||
updated_configs.insert(name, (new, user_provided));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// unchanged config
|
|
||||||
EitherOrBoth::Left((name, (setting, user_provided))) => {
|
|
||||||
tracing::debug!(embedder = name, "unchanged embedder");
|
|
||||||
updated_configs.insert(name, (Setting::Set(setting), user_provided));
|
|
||||||
}
|
|
||||||
// new config
|
|
||||||
EitherOrBoth::Right((name, mut setting)) => {
|
|
||||||
tracing::debug!(embedder = name, "new embedder");
|
|
||||||
// apply the default source in case the source was not set so that it gets validated
|
|
||||||
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
|
|
||||||
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
|
|
||||||
&mut setting,
|
|
||||||
);
|
|
||||||
let setting = validate_embedding_settings(setting, &name)?;
|
|
||||||
embedder_actions
|
|
||||||
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
|
|
||||||
updated_configs.insert(name, (setting, RoaringBitmap::new()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
|
|
||||||
for res in self.index.embedder_category_id.iter(self.wtxn)? {
|
|
||||||
let (_name, id) = res?;
|
|
||||||
free_indices[id as usize] = false;
|
|
||||||
}
|
|
||||||
let mut free_indices = free_indices.iter_mut().enumerate();
|
|
||||||
let mut find_free_index =
|
|
||||||
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
|
|
||||||
for (name, action) in embedder_actions.iter() {
|
|
||||||
match action {
|
|
||||||
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
|
|
||||||
/* cannot be a new embedder, so has to have an id already */
|
|
||||||
}
|
|
||||||
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
|
|
||||||
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
|
|
||||||
let id = find_free_index()
|
|
||||||
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
|
|
||||||
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
|
|
||||||
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|(name, (config, user_provided))| match config {
|
|
||||||
Setting::Set(config) => {
|
|
||||||
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
|
|
||||||
}
|
|
||||||
Setting::Reset => None,
|
|
||||||
Setting::NotSet => Some(IndexEmbeddingConfig {
|
|
||||||
name,
|
|
||||||
config: EmbeddingSettings::default().into(),
|
|
||||||
user_provided,
|
|
||||||
}),
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
if updated_configs.is_empty() {
|
|
||||||
self.index.delete_embedding_configs(self.wtxn)?;
|
|
||||||
} else {
|
|
||||||
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
|
|
||||||
}
|
|
||||||
Ok(embedder_actions)
|
|
||||||
}
|
}
|
||||||
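Both versions of the embedder-settings update in this hunk walk the old and new configs in lockstep with itertools' `merge_join_by`, so each embedder name surfaces exactly once as removed, updated, or added. A minimal sketch of that merge-join pattern, with plain string configs standing in for milli's real types (the `Action` enum here is illustrative only):

```rust
// Sketch of the merge-join used to diff two sorted embedder-config maps.
// `Action` and the String configs are illustrative, not milli's real types.
use std::collections::BTreeMap;

use itertools::{EitherOrBoth, Itertools};

#[derive(Debug, PartialEq)]
enum Action {
    Removed,
    Updated,
    Added,
}

fn diff_configs(
    old: BTreeMap<String, String>,
    new: BTreeMap<String, String>,
) -> Vec<(String, Action)> {
    old.into_iter()
        // both iterators are ordered by key, so each name is paired up at most once
        .merge_join_by(new, |(left, _), (right, _)| left.cmp(right))
        .map(|joined| match joined {
            // present on both sides: an update (possibly a no-op)
            EitherOrBoth::Both((name, _old), (_, _new)) => (name, Action::Updated),
            // only in the old map: the embedder was removed
            EitherOrBoth::Left((name, _)) => (name, Action::Removed),
            // only in the new map: a brand new embedder
            EitherOrBoth::Right((name, _)) => (name, Action::Added),
        })
        .collect()
}

fn main() {
    let old = BTreeMap::from([("default".to_string(), "openAi".to_string())]);
    let new = BTreeMap::from([
        ("default".to_string(), "huggingFace".to_string()),
        ("small".to_string(), "userProvided".to_string()),
    ]);
    assert_eq!(
        diff_configs(old, new),
        vec![
            ("default".to_string(), Action::Updated),
            ("small".to_string(), Action::Added),
        ]
    );
}
```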
|
|
||||||
fn update_search_cutoff(&mut self) -> Result<bool> {
|
fn update_search_cutoff(&mut self) -> Result<bool> {
|
||||||
@@ -1151,8 +1058,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
self.update_searchable()?;
|
self.update_searchable()?;
|
||||||
self.update_exact_attributes()?;
|
self.update_exact_attributes()?;
|
||||||
self.update_proximity_precision()?;
|
self.update_proximity_precision()?;
|
||||||
|
// TODO: very rough approximation of the needs for reindexing where any change will result in
|
||||||
let embedding_config_updates = self.update_embedding_configs()?;
|
// a full reindexing.
|
||||||
|
// What can be done instead:
|
||||||
|
// 1. Only change the distance on a distance change
|
||||||
|
// 2. Only change the name -> embedder mapping on a name change
|
||||||
|
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
|
||||||
|
let embedding_configs_updated = self.update_embedding_configs()?;
|
||||||
|
|
||||||
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||||
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
|
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
|
||||||
@@ -1166,7 +1078,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
old_inner_settings,
|
old_inner_settings,
|
||||||
new_inner_settings,
|
new_inner_settings,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
embedding_config_updates,
|
embedding_configs_updated,
|
||||||
settings_update_only,
|
settings_update_only,
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -1182,7 +1094,8 @@ pub struct InnerIndexSettingsDiff {
|
|||||||
pub(crate) old: InnerIndexSettings,
|
pub(crate) old: InnerIndexSettings,
|
||||||
pub(crate) new: InnerIndexSettings,
|
pub(crate) new: InnerIndexSettings,
|
||||||
pub(crate) primary_key_id: Option<FieldId>,
|
pub(crate) primary_key_id: Option<FieldId>,
|
||||||
pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
// TODO: compare directly the embedders.
|
||||||
|
pub(crate) embedding_configs_updated: bool,
|
||||||
pub(crate) settings_update_only: bool,
|
pub(crate) settings_update_only: bool,
|
||||||
/// The set of only the additional searchable fields.
|
/// The set of only the additional searchable fields.
|
||||||
/// If any other searchable field has been modified, is set to None.
|
/// If any other searchable field has been modified, is set to None.
|
||||||
@@ -1203,7 +1116,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
old_settings: InnerIndexSettings,
|
old_settings: InnerIndexSettings,
|
||||||
new_settings: InnerIndexSettings,
|
new_settings: InnerIndexSettings,
|
||||||
primary_key_id: Option<FieldId>,
|
primary_key_id: Option<FieldId>,
|
||||||
embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
embedding_configs_updated: bool,
|
||||||
settings_update_only: bool,
|
settings_update_only: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let only_additional_fields = match (
|
let only_additional_fields = match (
|
||||||
@@ -1240,7 +1153,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
old: old_settings,
|
old: old_settings,
|
||||||
new: new_settings,
|
new: new_settings,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
embedding_config_updates,
|
embedding_configs_updated,
|
||||||
settings_update_only,
|
settings_update_only,
|
||||||
only_additional_fields,
|
only_additional_fields,
|
||||||
cache_reindex_searchable_without_user_defined,
|
cache_reindex_searchable_without_user_defined,
|
||||||
@@ -1249,6 +1162,18 @@ impl InnerIndexSettingsDiff {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn searchable_fields_to_index(&self) -> BTreeSet<FieldId> {
|
||||||
|
if self.settings_update_only {
|
||||||
|
self.new
|
||||||
|
.fields_ids_map
|
||||||
|
.ids()
|
||||||
|
.filter(|id| self.reindex_searchable_id(*id).is_some())
|
||||||
|
.collect()
|
||||||
|
} else {
|
||||||
|
self.new.searchable_fields_ids.iter().copied().collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn any_reindexing_needed(&self) -> bool {
|
pub fn any_reindexing_needed(&self) -> bool {
|
||||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||||
}
|
}
|
||||||
@@ -1307,7 +1232,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn reindex_vectors(&self) -> bool {
|
pub fn reindex_vectors(&self) -> bool {
|
||||||
!self.embedding_config_updates.is_empty()
|
self.embedding_configs_updated
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn settings_update_only(&self) -> bool {
|
pub fn settings_update_only(&self) -> bool {
|
||||||
@@ -1339,8 +1264,6 @@ pub(crate) struct InnerIndexSettings {
|
|||||||
pub embedding_configs: EmbeddingConfigs,
|
pub embedding_configs: EmbeddingConfigs,
|
||||||
pub existing_fields: HashSet<String>,
|
pub existing_fields: HashSet<String>,
|
||||||
pub geo_fields_ids: Option<(FieldId, FieldId)>,
|
pub geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
pub non_searchable_fields_ids: Vec<FieldId>,
|
|
||||||
pub non_faceted_fields_ids: Vec<FieldId>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl InnerIndexSettings {
|
impl InnerIndexSettings {
|
||||||
@@ -1354,8 +1277,8 @@ impl InnerIndexSettings {
|
|||||||
let user_defined_searchable_fields =
|
let user_defined_searchable_fields =
|
||||||
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
|
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
|
||||||
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
|
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
|
||||||
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
|
let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
|
||||||
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
|
let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
|
||||||
let exact_attributes = index.exact_attributes_ids(rtxn)?;
|
let exact_attributes = index.exact_attributes_ids(rtxn)?;
|
||||||
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
|
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
|
||||||
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
|
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
|
||||||
@@ -1383,10 +1306,6 @@ impl InnerIndexSettings {
|
|||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
|
|
||||||
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
|
|
||||||
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
|
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
stop_words,
|
stop_words,
|
||||||
allowed_separators,
|
allowed_separators,
|
||||||
@@ -1401,8 +1320,6 @@ impl InnerIndexSettings {
|
|||||||
embedding_configs,
|
embedding_configs,
|
||||||
existing_fields,
|
existing_fields,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
non_searchable_fields_ids: vectors_fids.clone(),
|
|
||||||
non_faceted_fields_ids: vectors_fids.clone(),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
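The `from_index` lines present on only one side of this hunk exclude every field nested under `_vectors` from the searchable and faceted id sets. A small self-contained sketch of that retain-based filtering (the field ids are illustrative; the real code obtains them from `FieldsIdsMap::nested_ids(RESERVED_VECTORS_FIELD_NAME)`):

```rust
// Sketch of the `_vectors` exclusion: any field id nested under `_vectors`
// is kept out of the searchable and faceted sets.
fn exclude_vector_fields(searchable: &mut Vec<u16>, faceted: &mut Vec<u16>, vectors_fids: &[u16]) {
    searchable.retain(|id| !vectors_fids.contains(id));
    faceted.retain(|id| !vectors_fids.contains(id));
}

fn main() {
    let mut searchable = vec![0, 1, 4, 5];
    let mut faceted = vec![1, 5];
    let vectors_fids = [4, 5]; // e.g. ids for `_vectors.default`, `_vectors.default.embeddings`
    exclude_vector_fields(&mut searchable, &mut faceted, &vectors_fids);
    assert_eq!(searchable, vec![0, 1]);
    assert_eq!(faceted, vec![1]);
}
```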
|
|
||||||
@@ -1410,10 +1327,9 @@ impl InnerIndexSettings {
|
|||||||
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
||||||
let new_facets = self
|
let new_facets = self
|
||||||
.fields_ids_map
|
.fields_ids_map
|
||||||
.iter()
|
.names()
|
||||||
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
|
.filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
|
||||||
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
|
.map(|field| field.to_string())
|
||||||
.map(|(_fid, field)| field.to_string())
|
|
||||||
.collect();
|
.collect();
|
||||||
index.put_faceted_fields(wtxn, &new_facets)?;
|
index.put_faceted_fields(wtxn, &new_facets)?;
|
||||||
|
|
||||||
@@ -1433,7 +1349,6 @@ impl InnerIndexSettings {
|
|||||||
index.put_all_searchable_fields_from_fields_ids_map(
|
index.put_all_searchable_fields_from_fields_ids_map(
|
||||||
wtxn,
|
wtxn,
|
||||||
&searchable_fields,
|
&searchable_fields,
|
||||||
&self.non_searchable_fields_ids,
|
|
||||||
&self.fields_ids_map,
|
&self.fields_ids_map,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@@ -1444,25 +1359,19 @@ impl InnerIndexSettings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
|
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
|
||||||
let res: Result<_> = embedding_configs
|
let res: Result<_> = embedding_configs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(
|
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|
||||||
|IndexEmbeddingConfig {
|
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
||||||
name,
|
|
||||||
config: EmbeddingConfig { embedder_options, prompt },
|
|
||||||
..
|
|
||||||
}| {
|
|
||||||
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
|
||||||
|
|
||||||
let embedder = Arc::new(
|
let embedder = Arc::new(
|
||||||
Embedder::new(embedder_options.clone())
|
Embedder::new(embedder_options.clone())
|
||||||
.map_err(crate::vector::Error::from)
|
.map_err(crate::vector::Error::from)
|
||||||
.map_err(crate::Error::from)?,
|
.map_err(crate::Error::from)?,
|
||||||
);
|
);
|
||||||
Ok((name, (embedder, prompt)))
|
Ok((name, (embedder, prompt)))
|
||||||
},
|
})
|
||||||
)
|
|
||||||
.collect();
|
.collect();
|
||||||
res.map(EmbeddingConfigs::new)
|
res.map(EmbeddingConfigs::new)
|
||||||
}
|
}
|
||||||
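`embedders` builds the name-to-embedder map by converting each stored config fallibly and collecting straight into a `Result`, so the first failing conversion aborts the whole build. A minimal sketch of that collect-into-`Result` pattern, with made-up `RawConfig`/`Built` types in place of milli's `IndexEmbeddingConfig` and `Embedder`:

```rust
// Sketch of the collect-into-Result pattern used by `embedders()`: each stored
// config is converted fallibly and the first error aborts the whole map build.
// `RawConfig` / `Built` are illustrative stand-ins for milli's config types.
use std::collections::HashMap;
use std::sync::Arc;

struct RawConfig {
    name: String,
    dimensions: usize,
}

struct Built {
    dimensions: usize,
}

fn build(raw: Vec<RawConfig>) -> Result<HashMap<String, Arc<Built>>, String> {
    raw.into_iter()
        .map(|RawConfig { name, dimensions }| {
            if dimensions == 0 {
                // a failing entry short-circuits the surrounding collect()
                return Err(format!("embedder `{name}` has zero dimensions"));
            }
            Ok((name, Arc::new(Built { dimensions })))
        })
        // Result implements FromIterator, so the iterator of Results folds into one Result
        .collect()
}

fn main() {
    let ok = build(vec![RawConfig { name: "default".into(), dimensions: 768 }]).unwrap();
    assert_eq!(ok["default"].dimensions, 768);
    let err = build(vec![RawConfig { name: "broken".into(), dimensions: 0 }]);
    assert!(err.is_err());
}
```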
|
|||||||
@@ -152,10 +152,6 @@ impl EmbeddingConfigs {
|
|||||||
&self.0
|
&self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>)> {
|
|
||||||
self.0
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the name of the default embedder configuration.
|
/// Get the name of the default embedder configuration.
|
||||||
///
|
///
|
||||||
/// The default embedder is determined as follows:
|
/// The default embedder is determined as follows:
|
||||||
|
|||||||
@@ -1,119 +1,51 @@
|
|||||||
use std::collections::{BTreeMap, BTreeSet};
|
use std::collections::{BTreeMap, BTreeSet};
|
||||||
|
|
||||||
use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
|
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
|
|
||||||
use super::Embedding;
|
use super::Embedding;
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
use crate::{DocumentId, FieldId, InternalError, UserError};
|
use crate::{FieldId, InternalError, UserError};
|
||||||
|
|
||||||
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
|
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
|
||||||
|
|
||||||
#[derive(serde::Serialize, Debug)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub enum Vectors {
|
pub enum Vectors {
|
||||||
ImplicitlyUserProvided(VectorOrArrayOfVectors),
|
ImplicitlyUserProvided(VectorOrArrayOfVectors),
|
||||||
Explicit(ExplicitVectors),
|
Explicit(ExplicitVectors),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<E: DeserializeError> Deserr<E> for Vectors {
|
|
||||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
|
||||||
value: deserr::Value<V>,
|
|
||||||
location: deserr::ValuePointerRef,
|
|
||||||
) -> Result<Self, E> {
|
|
||||||
match value {
|
|
||||||
deserr::Value::Sequence(_) | deserr::Value::Null => {
|
|
||||||
Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value(
|
|
||||||
value, location,
|
|
||||||
)?))
|
|
||||||
}
|
|
||||||
deserr::Value::Map(_) => {
|
|
||||||
Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?))
|
|
||||||
}
|
|
||||||
|
|
||||||
value => Err(take_cf_content(E::error(
|
|
||||||
None,
|
|
||||||
deserr::ErrorKind::IncorrectValueKind {
|
|
||||||
actual: value,
|
|
||||||
accepted: &[
|
|
||||||
deserr::ValueKind::Sequence,
|
|
||||||
deserr::ValueKind::Map,
|
|
||||||
deserr::ValueKind::Null,
|
|
||||||
],
|
|
||||||
},
|
|
||||||
location,
|
|
||||||
))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Vectors {
|
impl Vectors {
|
||||||
pub fn must_regenerate(&self) -> bool {
|
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
|
||||||
match self {
|
match self {
|
||||||
Vectors::ImplicitlyUserProvided(_) => false,
|
Vectors::ImplicitlyUserProvided(embeddings)
|
||||||
Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate,
|
| Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => {
|
||||||
}
|
embeddings.into_array_of_vectors().unwrap_or_default()
|
||||||
}
|
|
||||||
|
|
||||||
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
|
|
||||||
match self {
|
|
||||||
Vectors::ImplicitlyUserProvided(embeddings) => {
|
|
||||||
Some(embeddings.into_array_of_vectors().unwrap_or_default())
|
|
||||||
}
|
|
||||||
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
|
|
||||||
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
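Both shapes accepted for a `_vectors.<embedder>` entry appear in this hunk: a bare vector or array of vectors (implicitly user-provided) and an explicit object carrying `embeddings` plus a flag. A rough serde-only sketch of those two JSON shapes, using the `regenerate` field name from the richer side of the diff (the real type is deserialized through deserr and these names are simplified):

```rust
// Rough serde-only sketch of the two accepted `_vectors.<embedder>` shapes.
use serde::Deserialize;
use serde_json::json;

// a single embedding, or several of them
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum VectorOrVectors {
    One(Vec<f32>),
    Many(Vec<Vec<f32>>),
}

#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum VectorsValue {
    // `{ "embeddings": [[0.1, 0.2]], "regenerate": false }`
    Explicit { embeddings: Option<VectorOrVectors>, regenerate: bool },
    // bare `[0.1, 0.2]` or `[[0.1, 0.2], [0.3, 0.4]]`: implicitly user-provided
    Implicit(VectorOrVectors),
}

fn main() {
    let implicit: VectorsValue = serde_json::from_value(json!([0.1, 0.2])).unwrap();
    let explicit: VectorsValue =
        serde_json::from_value(json!({ "embeddings": [[0.1, 0.2]], "regenerate": false }))
            .unwrap();
    println!("{implicit:?}\n{explicit:?}");
}
```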
#[derive(serde::Serialize, Deserr, Debug)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct ExplicitVectors {
|
pub struct ExplicitVectors {
|
||||||
#[serde(default)]
|
pub embeddings: VectorOrArrayOfVectors,
|
||||||
#[deserr(default)]
|
pub user_provided: bool,
|
||||||
pub embeddings: Option<VectorOrArrayOfVectors>,
|
|
||||||
pub regenerate: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum VectorState {
|
|
||||||
Inline(Vectors),
|
|
||||||
Manual,
|
|
||||||
Generated,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl VectorState {
|
|
||||||
pub fn must_regenerate(&self) -> bool {
|
|
||||||
match self {
|
|
||||||
VectorState::Inline(vectors) => vectors.must_regenerate(),
|
|
||||||
VectorState::Manual => false,
|
|
||||||
VectorState::Generated => true,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum VectorsState {
|
|
||||||
NoVectorsFid,
|
|
||||||
NoVectorsFieldInDocument,
|
|
||||||
Vectors(BTreeMap<String, Vectors>),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ParsedVectorsDiff {
|
pub struct ParsedVectorsDiff {
|
||||||
old: BTreeMap<String, VectorState>,
|
pub old: Option<BTreeMap<String, Vectors>>,
|
||||||
new: VectorsState,
|
pub new: Option<BTreeMap<String, Vectors>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ParsedVectorsDiff {
|
impl ParsedVectorsDiff {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
docid: DocumentId,
|
|
||||||
embedders_configs: &[IndexEmbeddingConfig],
|
|
||||||
documents_diff: KvReader<'_, FieldId>,
|
documents_diff: KvReader<'_, FieldId>,
|
||||||
old_vectors_fid: Option<FieldId>,
|
old_vectors_fid: Option<FieldId>,
|
||||||
new_vectors_fid: Option<FieldId>,
|
new_vectors_fid: Option<FieldId>,
|
||||||
) -> Result<Self, Error> {
|
) -> Result<Self, Error> {
|
||||||
let mut old = match old_vectors_fid
|
let old = match old_vectors_fid
|
||||||
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
|
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
|
||||||
.map(KvReaderDelAdd::new)
|
.map(KvReaderDelAdd::new)
|
||||||
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
|
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
|
||||||
@@ -129,84 +61,48 @@ impl ParsedVectorsDiff {
|
|||||||
return Err(error);
|
return Err(error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
|
.flatten();
|
||||||
for embedding_config in embedders_configs {
|
let new = new_vectors_fid
|
||||||
if embedding_config.user_provided.contains(docid) {
|
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
|
||||||
old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
|
.map(KvReaderDelAdd::new)
|
||||||
}
|
.map(|obkv| to_vector_map(obkv, DelAdd::Addition))
|
||||||
}
|
.transpose()?
|
||||||
|
.flatten();
|
||||||
let new = 'new: {
|
|
||||||
let Some(new_vectors_fid) = new_vectors_fid else {
|
|
||||||
break 'new VectorsState::NoVectorsFid;
|
|
||||||
};
|
|
||||||
let Some(bytes) = documents_diff.get(new_vectors_fid) else {
|
|
||||||
break 'new VectorsState::NoVectorsFieldInDocument;
|
|
||||||
};
|
|
||||||
let obkv = KvReaderDelAdd::new(bytes);
|
|
||||||
match to_vector_map(obkv, DelAdd::Addition)? {
|
|
||||||
Some(new) => VectorsState::Vectors(new),
|
|
||||||
None => VectorsState::NoVectorsFieldInDocument,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self { old, new })
|
Ok(Self { old, new })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) {
|
pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) {
|
||||||
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
|
let old = self.old.as_mut().and_then(|old| old.remove(embedder_name));
|
||||||
let state_from_old = match old {
|
let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
|
||||||
// assume a userProvided is still userProvided
|
|
||||||
VectorState::Manual => VectorState::Manual,
|
|
||||||
// generated is still generated
|
|
||||||
VectorState::Generated => VectorState::Generated,
|
|
||||||
// weird case that shouldn't happen where the previous version of the doc is inline,
|
|
||||||
// but it was removed in the new version
|
|
||||||
// Since it is not in the new version, we switch to generated
|
|
||||||
VectorState::Inline(_) => VectorState::Generated,
|
|
||||||
};
|
|
||||||
let new = match &mut self.new {
|
|
||||||
VectorsState::Vectors(new) => {
|
|
||||||
new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old)
|
|
||||||
}
|
|
||||||
_ =>
|
|
||||||
// if no `_vectors` field is present in the new document,
|
|
||||||
// the state depends on the previous version of the document
|
|
||||||
{
|
|
||||||
state_from_old
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
(old, new)
|
(old, new)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
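`ParsedVectorsDiff::remove`, in the version that tracks `VectorState`, resolves the new state by preferring an inline `_vectors` entry from the new document and otherwise carrying the old state forward, with a vanished inline entry degrading to `Generated`. A compact sketch of that fallback rule (enum and helper names are illustrative):

```rust
// Compact sketch of the fallback rule in the VectorState-based remove().
#[derive(Clone, Copy, Debug, PartialEq)]
enum State {
    Inline,    // vectors spelled out in the document's `_vectors` field
    Manual,    // previously user-provided for this embedder
    Generated, // previously generated by the embedder
}

// What the old document contributes once its inline vectors cannot be reused.
fn carry_over(old: State) -> State {
    match old {
        State::Manual => State::Manual,       // user-provided stays user-provided
        State::Generated => State::Generated, // generated stays generated
        // inline in the old doc but absent from the new one: regenerate
        State::Inline => State::Generated,
    }
}

// An explicit `_vectors` entry in the new document wins; otherwise fall back.
fn new_state(new_inline: Option<State>, old: State) -> State {
    new_inline.unwrap_or_else(|| carry_over(old))
}

fn main() {
    assert_eq!(new_state(Some(State::Inline), State::Generated), State::Inline);
    assert_eq!(new_state(None, State::Manual), State::Manual);
    assert_eq!(new_state(None, State::Inline), State::Generated);
}
```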
|
|
||||||
pub struct ParsedVectors(pub BTreeMap<String, Vectors>);
|
pub struct ParsedVectors(pub BTreeMap<String, Vectors>);
|
||||||
|
|
||||||
impl<E: DeserializeError> Deserr<E> for ParsedVectors {
|
|
||||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
|
||||||
value: deserr::Value<V>,
|
|
||||||
location: deserr::ValuePointerRef,
|
|
||||||
) -> Result<Self, E> {
|
|
||||||
let value = <BTreeMap<String, Vectors>>::deserialize_from_value(value, location)?;
|
|
||||||
Ok(ParsedVectors(value))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ParsedVectors {
|
impl ParsedVectors {
|
||||||
pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
|
pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
|
||||||
let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?;
|
let Ok(value) = from_slice(value) else {
|
||||||
deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error })
|
let value = from_slice(value).map_err(Error::InternalSerdeJson)?;
|
||||||
|
return Err(Error::InvalidMap(value));
|
||||||
|
};
|
||||||
|
Ok(ParsedVectors(value))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
|
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
|
||||||
self.0.retain(|k, _v| !embedders.contains(k))
|
self.0.retain(|k, v| match v {
|
||||||
|
Vectors::ImplicitlyUserProvided(_) => true,
|
||||||
|
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
|
||||||
|
*user_provided
|
||||||
|
// if the embedder is not in the config, then never touch it
|
||||||
|
|| !embedders.contains(k)
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
InvalidMap(Value),
|
InvalidMap(Value),
|
||||||
InvalidEmbedderConf { error: deserr::errors::JsonError },
|
|
||||||
InternalSerdeJson(serde_json::Error),
|
InternalSerdeJson(serde_json::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -216,12 +112,6 @@ impl Error {
|
|||||||
Error::InvalidMap(value) => {
|
Error::InvalidMap(value) => {
|
||||||
crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
|
crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
|
||||||
}
|
}
|
||||||
Error::InvalidEmbedderConf { error } => {
|
|
||||||
crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
|
|
||||||
document_id,
|
|
||||||
error,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
Error::InternalSerdeJson(error) => {
|
Error::InternalSerdeJson(error) => {
|
||||||
crate::Error::InternalError(InternalError::SerdeJson(error))
|
crate::Error::InternalError(InternalError::SerdeJson(error))
|
||||||
}
|
}
|
||||||
@@ -242,84 +132,13 @@ fn to_vector_map(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Represents either a vector or an array of multiple vectors.
|
/// Represents either a vector or an array of multiple vectors.
|
||||||
#[derive(serde::Serialize, Debug)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||||
#[serde(transparent)]
|
#[serde(transparent)]
|
||||||
pub struct VectorOrArrayOfVectors {
|
pub struct VectorOrArrayOfVectors {
|
||||||
#[serde(with = "either::serde_untagged_optional")]
|
#[serde(with = "either::serde_untagged_optional")]
|
||||||
inner: Option<either::Either<Vec<Embedding>, Embedding>>,
|
inner: Option<either::Either<Vec<Embedding>, Embedding>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<E: DeserializeError> Deserr<E> for VectorOrArrayOfVectors {
|
|
||||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
|
||||||
value: deserr::Value<V>,
|
|
||||||
location: deserr::ValuePointerRef,
|
|
||||||
) -> Result<Self, E> {
|
|
||||||
match value {
|
|
||||||
deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }),
|
|
||||||
deserr::Value::Sequence(seq) => {
|
|
||||||
let mut iter = seq.into_iter();
|
|
||||||
match iter.next().map(|v| v.into_value()) {
|
|
||||||
None => {
|
|
||||||
// With the strange way serde serializes the `Either`, we must send the left part
|
|
||||||
// otherwise it'll consider we returned [[]]
|
|
||||||
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) })
|
|
||||||
}
|
|
||||||
Some(val @ deserr::Value::Sequence(_)) => {
|
|
||||||
let first = Embedding::deserialize_from_value(val, location.push_index(0))?;
|
|
||||||
let mut collect = vec![first];
|
|
||||||
let mut tail = iter
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, v)| {
|
|
||||||
Embedding::deserialize_from_value(
|
|
||||||
v.into_value(),
|
|
||||||
location.push_index(i + 1),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
|
||||||
collect.append(&mut tail);
|
|
||||||
|
|
||||||
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) })
|
|
||||||
}
|
|
||||||
Some(
|
|
||||||
val @ deserr::Value::Integer(_)
|
|
||||||
| val @ deserr::Value::NegativeInteger(_)
|
|
||||||
| val @ deserr::Value::Float(_),
|
|
||||||
) => {
|
|
||||||
let first = <f32>::deserialize_from_value(val, location.push_index(0))?;
|
|
||||||
let mut embedding = iter
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, v)| {
|
|
||||||
<f32>::deserialize_from_value(
|
|
||||||
v.into_value(),
|
|
||||||
location.push_index(i + 1),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
|
||||||
embedding.insert(0, first);
|
|
||||||
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) })
|
|
||||||
}
|
|
||||||
Some(value) => Err(take_cf_content(E::error(
|
|
||||||
None,
|
|
||||||
deserr::ErrorKind::IncorrectValueKind {
|
|
||||||
actual: value,
|
|
||||||
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float],
|
|
||||||
},
|
|
||||||
location.push_index(0),
|
|
||||||
))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
value => Err(take_cf_content(E::error(
|
|
||||||
None,
|
|
||||||
deserr::ErrorKind::IncorrectValueKind {
|
|
||||||
actual: value,
|
|
||||||
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null],
|
|
||||||
},
|
|
||||||
location,
|
|
||||||
))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl VectorOrArrayOfVectors {
|
impl VectorOrArrayOfVectors {
|
||||||
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
|
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
|
||||||
match self.inner? {
|
match self.inner? {
|
||||||
@@ -331,41 +150,21 @@ impl VectorOrArrayOfVectors {
|
|||||||
pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
|
pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
|
||||||
Self { inner: Some(either::Either::Left(array_of_vec)) }
|
Self { inner: Some(either::Either::Left(array_of_vec)) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_vector(vec: Embedding) -> Self {
|
|
||||||
Self { inner: Some(either::Either::Right(vec)) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Embedding> for VectorOrArrayOfVectors {
|
|
||||||
fn from(vec: Embedding) -> Self {
|
|
||||||
Self::from_vector(vec)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
|
|
||||||
fn from(vec: Vec<Embedding>) -> Self {
|
|
||||||
Self::from_array_of_vectors(vec)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::VectorOrArrayOfVectors;
|
use super::VectorOrArrayOfVectors;
|
||||||
|
|
||||||
fn embedding_from_str(s: &str) -> Result<VectorOrArrayOfVectors, deserr::errors::JsonError> {
|
|
||||||
let value: serde_json::Value = serde_json::from_str(s).unwrap();
|
|
||||||
deserr::deserialize(value)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn array_of_vectors() {
|
fn array_of_vectors() {
|
||||||
let null = embedding_from_str("null").unwrap();
|
let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
|
||||||
let empty = embedding_from_str("[]").unwrap();
|
let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
|
||||||
let one = embedding_from_str("[0.1]").unwrap();
|
let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
|
||||||
let two = embedding_from_str("[0.1, 0.2]").unwrap();
|
let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
|
||||||
let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap();
|
let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
|
||||||
let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
|
let two_vecs: VectorOrArrayOfVectors =
|
||||||
|
serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
|
||||||
|
|
||||||
insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
|
insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
|
||||||
insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");
|
insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use deserr::Deserr;
|
use deserr::Deserr;
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use super::rest::InputType;
|
use super::rest::InputType;
|
||||||
@@ -73,238 +72,6 @@ pub fn check_unset<T>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Indicates what action should take place during a reindexing operation for an embedder
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
||||||
pub enum ReindexAction {
|
|
||||||
/// An indexing operation should take place for this embedder, keeping existing vectors
|
|
||||||
/// and checking whether the document template changed or not
|
|
||||||
RegeneratePrompts,
|
|
||||||
/// An indexing operation should take place for all documents for this embedder, removing existing vectors
|
|
||||||
/// (except userProvided ones)
|
|
||||||
FullReindex,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum SettingsDiff {
|
|
||||||
Remove,
|
|
||||||
Reindex { action: ReindexAction, updated_settings: EmbeddingSettings },
|
|
||||||
UpdateWithoutReindex { updated_settings: EmbeddingSettings },
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum EmbedderAction {
|
|
||||||
WriteBackToDocuments(WriteBackToDocuments),
|
|
||||||
Reindex(ReindexAction),
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct WriteBackToDocuments {
|
|
||||||
pub embedder_id: u8,
|
|
||||||
pub user_provided: RoaringBitmap,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SettingsDiff {
|
|
||||||
pub fn from_settings(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Self {
|
|
||||||
match new {
|
|
||||||
Setting::Set(new) => {
|
|
||||||
let EmbeddingSettings {
|
|
||||||
mut source,
|
|
||||||
mut model,
|
|
||||||
mut revision,
|
|
||||||
mut api_key,
|
|
||||||
mut dimensions,
|
|
||||||
mut document_template,
|
|
||||||
mut url,
|
|
||||||
mut query,
|
|
||||||
mut input_field,
|
|
||||||
mut path_to_embeddings,
|
|
||||||
mut embedding_object,
|
|
||||||
mut input_type,
|
|
||||||
mut distribution,
|
|
||||||
} = old;
|
|
||||||
|
|
||||||
let EmbeddingSettings {
|
|
||||||
source: new_source,
|
|
||||||
model: new_model,
|
|
||||||
revision: new_revision,
|
|
||||||
api_key: new_api_key,
|
|
||||||
dimensions: new_dimensions,
|
|
||||||
document_template: new_document_template,
|
|
||||||
url: new_url,
|
|
||||||
query: new_query,
|
|
||||||
input_field: new_input_field,
|
|
||||||
path_to_embeddings: new_path_to_embeddings,
|
|
||||||
embedding_object: new_embedding_object,
|
|
||||||
input_type: new_input_type,
|
|
||||||
distribution: new_distribution,
|
|
||||||
} = new;
|
|
||||||
|
|
||||||
let mut reindex_action = None;
|
|
||||||
|
|
||||||
// **Warning**: do not use short-circuiting || here; we want all these operations applied
|
|
||||||
if source.apply(new_source) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
// when the source changes, we need to reapply the default settings for the new source
|
|
||||||
apply_default_for_source(
|
|
||||||
&source,
|
|
||||||
&mut model,
|
|
||||||
&mut revision,
|
|
||||||
&mut dimensions,
|
|
||||||
&mut url,
|
|
||||||
&mut query,
|
|
||||||
&mut input_field,
|
|
||||||
&mut path_to_embeddings,
|
|
||||||
&mut embedding_object,
|
|
||||||
&mut input_type,
|
|
||||||
&mut document_template,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
if model.apply(new_model) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if revision.apply(new_revision) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if dimensions.apply(new_dimensions) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if url.apply(new_url) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if query.apply(new_query) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if input_field.apply(new_input_field) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if path_to_embeddings.apply(new_path_to_embeddings) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if embedding_object.apply(new_embedding_object) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if input_type.apply(new_input_type) {
|
|
||||||
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
|
|
||||||
}
|
|
||||||
if document_template.apply(new_document_template) {
|
|
||||||
ReindexAction::push_action(
|
|
||||||
&mut reindex_action,
|
|
||||||
ReindexAction::RegeneratePrompts,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
distribution.apply(new_distribution);
|
|
||||||
api_key.apply(new_api_key);
|
|
||||||
|
|
||||||
let updated_settings = EmbeddingSettings {
|
|
||||||
source,
|
|
||||||
model,
|
|
||||||
revision,
|
|
||||||
api_key,
|
|
||||||
dimensions,
|
|
||||||
document_template,
|
|
||||||
url,
|
|
||||||
query,
|
|
||||||
input_field,
|
|
||||||
path_to_embeddings,
|
|
||||||
embedding_object,
|
|
||||||
input_type,
|
|
||||||
distribution,
|
|
||||||
};
|
|
||||||
|
|
||||||
match reindex_action {
|
|
||||||
Some(action) => Self::Reindex { action, updated_settings },
|
|
||||||
None => Self::UpdateWithoutReindex { updated_settings },
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Setting::Reset => Self::Remove,
|
|
||||||
Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old },
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ReindexAction {
|
|
||||||
fn push_action(this: &mut Option<Self>, other: Self) {
|
|
||||||
*this = match (*this, other) {
|
|
||||||
(_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
|
|
||||||
(Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
|
|
||||||
(_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
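`ReindexAction::push_action` escalates the pending action: a `FullReindex` requested by any field change can never be downgraded by a later prompt-only change, which is also why the warning above insists on running every `apply` instead of short-circuiting. A small sketch of that escalation rule with a simplified enum:

```rust
// Sketch of the escalation rule behind ReindexAction::push_action:
// FullReindex always wins over RegeneratePrompts, and over "no action yet".
#[derive(Clone, Copy, Debug, PartialEq)]
enum Action { RegeneratePrompts, FullReindex }

fn push(current: Option<Action>, incoming: Action) -> Option<Action> {
    match (current, incoming) {
        (_, Action::FullReindex) => Some(Action::FullReindex),
        (Some(Action::FullReindex), _) => Some(Action::FullReindex),
        (_, Action::RegeneratePrompts) => Some(Action::RegeneratePrompts),
    }
}

fn main() {
    assert_eq!(push(None, Action::RegeneratePrompts), Some(Action::RegeneratePrompts));
    assert_eq!(push(Some(Action::RegeneratePrompts), Action::FullReindex), Some(Action::FullReindex));
    // once a full reindex is queued, a later prompt-only change cannot downgrade it
    assert_eq!(push(Some(Action::FullReindex), Action::RegeneratePrompts), Some(Action::FullReindex));
}
```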
#[allow(clippy::too_many_arguments)] // private function
|
|
||||||
fn apply_default_for_source(
|
|
||||||
source: &Setting<EmbedderSource>,
|
|
||||||
model: &mut Setting<String>,
|
|
||||||
revision: &mut Setting<String>,
|
|
||||||
dimensions: &mut Setting<usize>,
|
|
||||||
url: &mut Setting<String>,
|
|
||||||
query: &mut Setting<serde_json::Value>,
|
|
||||||
input_field: &mut Setting<Vec<String>>,
|
|
||||||
path_to_embeddings: &mut Setting<Vec<String>>,
|
|
||||||
embedding_object: &mut Setting<Vec<String>>,
|
|
||||||
input_type: &mut Setting<InputType>,
|
|
||||||
document_template: &mut Setting<String>,
|
|
||||||
) {
|
|
||||||
match source {
|
|
||||||
Setting::Set(EmbedderSource::HuggingFace) => {
|
|
||||||
*model = Setting::Reset;
|
|
||||||
*revision = Setting::Reset;
|
|
||||||
*dimensions = Setting::NotSet;
|
|
||||||
*url = Setting::NotSet;
|
|
||||||
*query = Setting::NotSet;
|
|
||||||
*input_field = Setting::NotSet;
|
|
||||||
*path_to_embeddings = Setting::NotSet;
|
|
||||||
*embedding_object = Setting::NotSet;
|
|
||||||
*input_type = Setting::NotSet;
|
|
||||||
}
|
|
||||||
Setting::Set(EmbedderSource::Ollama) => {
|
|
||||||
*model = Setting::Reset;
|
|
||||||
*revision = Setting::NotSet;
|
|
||||||
*dimensions = Setting::Reset;
|
|
||||||
*url = Setting::NotSet;
|
|
||||||
*query = Setting::NotSet;
|
|
||||||
*input_field = Setting::NotSet;
|
|
||||||
*path_to_embeddings = Setting::NotSet;
|
|
||||||
*embedding_object = Setting::NotSet;
|
|
||||||
*input_type = Setting::NotSet;
|
|
||||||
}
|
|
||||||
Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
|
|
||||||
*model = Setting::Reset;
|
|
||||||
*revision = Setting::NotSet;
|
|
||||||
*dimensions = Setting::NotSet;
|
|
||||||
*url = Setting::NotSet;
|
|
||||||
*query = Setting::NotSet;
|
|
||||||
*input_field = Setting::NotSet;
|
|
||||||
*path_to_embeddings = Setting::NotSet;
|
|
||||||
*embedding_object = Setting::NotSet;
|
|
||||||
*input_type = Setting::NotSet;
|
|
||||||
}
|
|
||||||
Setting::Set(EmbedderSource::Rest) => {
|
|
||||||
*model = Setting::NotSet;
|
|
||||||
*revision = Setting::NotSet;
|
|
||||||
*dimensions = Setting::Reset;
|
|
||||||
*url = Setting::Reset;
|
|
||||||
*query = Setting::Reset;
|
|
||||||
*input_field = Setting::Reset;
|
|
||||||
*path_to_embeddings = Setting::Reset;
|
|
||||||
*embedding_object = Setting::Reset;
|
|
||||||
*input_type = Setting::Reset;
|
|
||||||
}
|
|
||||||
Setting::Set(EmbedderSource::UserProvided) => {
|
|
||||||
*model = Setting::NotSet;
|
|
||||||
*revision = Setting::NotSet;
|
|
||||||
*dimensions = Setting::Reset;
|
|
||||||
*url = Setting::NotSet;
|
|
||||||
*query = Setting::NotSet;
|
|
||||||
*input_field = Setting::NotSet;
|
|
||||||
*path_to_embeddings = Setting::NotSet;
|
|
||||||
*embedding_object = Setting::NotSet;
|
|
||||||
*input_type = Setting::NotSet;
|
|
||||||
*document_template = Setting::NotSet;
|
|
||||||
}
|
|
||||||
Setting::NotSet => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn check_set<T>(
|
pub fn check_set<T>(
|
||||||
key: &Setting<T>,
|
key: &Setting<T>,
|
||||||
field: &'static str,
|
field: &'static str,
|
||||||
@@ -443,6 +210,66 @@ impl EmbeddingSettings {
|
|||||||
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
|
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn apply_and_need_reindex(
|
||||||
|
old: &mut Setting<EmbeddingSettings>,
|
||||||
|
new: Setting<EmbeddingSettings>,
|
||||||
|
) -> bool {
|
||||||
|
match (old, new) {
|
||||||
|
(
|
||||||
|
Setting::Set(EmbeddingSettings {
|
||||||
|
source: old_source,
|
||||||
|
model: old_model,
|
||||||
|
revision: old_revision,
|
||||||
|
api_key: old_api_key,
|
||||||
|
dimensions: old_dimensions,
|
||||||
|
document_template: old_document_template,
|
||||||
|
url: old_url,
|
||||||
|
query: old_query,
|
||||||
|
input_field: old_input_field,
|
||||||
|
path_to_embeddings: old_path_to_embeddings,
|
||||||
|
embedding_object: old_embedding_object,
|
||||||
|
input_type: old_input_type,
|
||||||
|
distribution: old_distribution,
|
||||||
|
}),
|
||||||
|
Setting::Set(EmbeddingSettings {
|
||||||
|
source: new_source,
|
||||||
|
model: new_model,
|
||||||
|
revision: new_revision,
|
||||||
|
api_key: new_api_key,
|
||||||
|
dimensions: new_dimensions,
|
||||||
|
document_template: new_document_template,
|
||||||
|
url: new_url,
|
||||||
|
query: new_query,
|
||||||
|
input_field: new_input_field,
|
||||||
|
path_to_embeddings: new_path_to_embeddings,
|
||||||
|
embedding_object: new_embedding_object,
|
||||||
|
input_type: new_input_type,
|
||||||
|
distribution: new_distribution,
|
||||||
|
}),
|
||||||
|
) => {
|
||||||
|
let mut needs_reindex = false;
|
||||||
|
|
||||||
|
needs_reindex |= old_source.apply(new_source);
|
||||||
|
needs_reindex |= old_model.apply(new_model);
|
||||||
|
needs_reindex |= old_revision.apply(new_revision);
|
||||||
|
needs_reindex |= old_dimensions.apply(new_dimensions);
|
||||||
|
needs_reindex |= old_document_template.apply(new_document_template);
|
||||||
|
needs_reindex |= old_url.apply(new_url);
|
||||||
|
needs_reindex |= old_query.apply(new_query);
|
||||||
|
needs_reindex |= old_input_field.apply(new_input_field);
|
||||||
|
needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings);
|
||||||
|
needs_reindex |= old_embedding_object.apply(new_embedding_object);
|
||||||
|
needs_reindex |= old_input_type.apply(new_input_type);
|
||||||
|
|
||||||
|
old_distribution.apply(new_distribution);
|
||||||
|
old_api_key.apply(new_api_key);
|
||||||
|
needs_reindex
|
||||||
|
}
|
||||||
|
(Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
|
||||||
|
_ => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
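`apply_and_need_reindex` accumulates its verdict with non-short-circuiting `|=` so that every field is merged into `old` even after the first reindex-worthy change is found, while `distribution` and `api_key` are merged without affecting the verdict. A simplified sketch of that pattern, with a stand-in `Setting` type that merely treats any incoming `Set`/`Reset` as a change (milli's real `Setting::apply` may be smarter):

```rust
// Simplified sketch of the |= accumulation in apply_and_need_reindex.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

impl<T> Setting<T> {
    // Merge `new` into `self`; report whether anything was overwritten.
    // NotSet means "leave the old value alone".
    fn apply(&mut self, new: Setting<T>) -> bool {
        match new {
            Setting::NotSet => false,
            new => {
                *self = new;
                true
            }
        }
    }
}

fn main() {
    let mut model = Setting::Set("text-embedding-3-small");
    let mut api_key = Setting::Set("old-key");

    let mut needs_reindex = false;
    // non-short-circuiting |=: every apply() runs even once a change was found
    needs_reindex |= model.apply(Setting::Set("text-embedding-3-large"));
    needs_reindex |= model.apply(Setting::NotSet);
    // the api key is merged too, but never influences the reindex decision
    api_key.apply(Setting::Set("new-key"));

    assert!(needs_reindex);
    assert_eq!(model, Setting::Set("text-embedding-3-large"));
    assert_eq!(api_key, Setting::Set("new-key"));
}
```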
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
|
|||||||
"stream",
|
"stream",
|
||||||
"json",
|
"json",
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
], default-features = false }
|
], default_features = false }
|
||||||
serde = { version = "1.0.195", features = ["derive"] }
|
serde = { version = "1.0.195", features = ["derive"] }
|
||||||
serde_json = "1.0.111"
|
serde_json = "1.0.111"
|
||||||
sha2 = "0.10.8"
|
sha2 = "0.10.8"
|
||||||
|
|||||||