Compare commits


1 Commit

Author SHA1 Message Date
40215bfec3 TMP: check windows free disk space 2024-05-29 11:27:27 +02:00
104 changed files with 1693 additions and 7154 deletions

View File

@@ -56,6 +56,12 @@ jobs:
       matrix:
        os: [macos-12, windows-2022]
    steps:
+     - name: Check free disk space on C
+       run: |
+         fsutil volume diskfree c:
+     - name: Check free disk space on D
+       run: |
+         fsutil volume diskfree d:
      - uses: actions/checkout@v3
      - name: Cache dependencies
        uses: Swatinem/rust-cache@v2.7.1
@@ -63,11 +69,23 @@ jobs:
        with:
          toolchain: stable
          override: true
+     - name: Check free disk space on C
+       run: |
+         fsutil volume diskfree c:
+     - name: Check free disk space on D
+       run: |
+         fsutil volume diskfree d:
      - name: Run cargo check without any default features
        uses: actions-rs/cargo@v1
        with:
          command: build
          args: --locked --release --no-default-features --all
+     - name: Check free disk space on C
+       run: |
+         fsutil volume diskfree c:
+     - name: Check free disk space on D
+       run: |
+         fsutil volume diskfree d:
      - name: Run cargo test
        uses: actions-rs/cargo@v1
        with:
@@ -116,7 +134,7 @@ jobs:
          override: true
      - name: Run cargo tree without default features and check lindera is not present
        run: |
-         if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then
+         if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then
            echo "lindera has been found in the sources and it shouldn't"
            exit 1
          fi

503
Cargo.lock generated

File diff suppressed because it is too large

View File

@@ -22,7 +22,7 @@ members = [
 ]
 [workspace.package]
-version = "1.9.0"
+version = "1.8.0"
 authors = [
   "Quentin de Quelen <quentin@dequelen.me>",
   "Clément Renault <clement@meilisearch.com>",

View File

@@ -25,7 +25,7 @@
 <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
-[Meilisearch](https://www.meilisearch.com) helps you shape a delightful search experience in a snap, offering features that work out of the box to speed up your workflow.
+Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
 <p align="center" name="demo">
   <a href="https://where2watch.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=demo-gif#gh-light-mode-only" target="_blank">
@@ -39,8 +39,8 @@
 🔥 [**Try it!**](https://where2watch.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=demo-link) 🔥
 ## ✨ Features
-- **Hybrid search:** Combine the best of both [semantic](https://www.meilisearch.com/docs/learn/experimental/vector_search) & full-text search to get the most relevant results
-- **Search-as-you-type:** find & display results in less than 50 milliseconds to provide an intuitive experience
+- **Search-as-you-type:** find search results in less than 50 milliseconds
 - **[Typo tolerance](https://www.meilisearch.com/docs/learn/configuration/typo_tolerance?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** get relevant matches even when queries contain typos and misspellings
 - **[Filtering](https://www.meilisearch.com/docs/learn/fine_tuning_results/filtering?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features) and [faceted search](https://www.meilisearch.com/docs/learn/fine_tuning_results/faceted_search?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** enhance your users' search experience with custom filters and build a faceted search interface in a few lines of code
 - **[Sorting](https://www.meilisearch.com/docs/learn/fine_tuning_results/sorting?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** sort results based on price, date, or pretty much anything else your users need
@@ -55,15 +55,15 @@
 ## 📖 Documentation
-You can consult Meilisearch's documentation at [meilisearch.com/docs](https://www.meilisearch.com/docs/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=docs).
+You can consult Meilisearch's documentation at [https://www.meilisearch.com/docs](https://www.meilisearch.com/docs/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=docs).
 ## 🚀 Getting started
 For basic instructions on how to set up Meilisearch, add documents to an index, and search for documents, take a look at our [Quick Start](https://www.meilisearch.com/docs/learn/getting_started/quick_start?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=get-started) guide.
-## 🌍 Supercharge your Meilisearch experience
+## ⚡ Supercharge your Meilisearch experience
-Say goodbye to server deployment and manual updates with [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Additional features include analytics & monitoring in many regions around the world. No credit card is required.
+Say goodbye to server deployment and manual updates with [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). No credit card required.
 ## 🧰 SDKs & integration tools
@@ -85,13 +85,13 @@ Finally, for more in-depth information, refer to our articles explaining fundame
 Meilisearch collects **anonymized** data from users to help us improve our product. You can [deactivate this](https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=telemetry#how-to-disable-data-collection) whenever you want.
-To request deletion of collected data, please write to us at [privacy@meilisearch.com](mailto:privacy@meilisearch.com). Remember to include your `Instance UID` in the message, as this helps us quickly find and delete your data.
+To request deletion of collected data, please write to us at [privacy@meilisearch.com](mailto:privacy@meilisearch.com). Don't forget to include your `Instance UID` in the message, as this helps us quickly find and delete your data.
 If you want to know more about the kind of data we collect and what we use it for, check the [telemetry section](https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=telemetry#how-to-disable-data-collection) of our documentation.
 ## 📫 Get in touch!
-Meilisearch is a search engine created by [Meili]([https://www.welcometothejungle.com/en/companies/meilisearch](https://www.meilisearch.com/careers)), a software development company headquartered in France and with team members all over the world. Want to know more about us? [Check out our blog!](https://blog.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=contact)
+Meilisearch is a search engine created by [Meili](https://www.welcometothejungle.com/en/companies/meilisearch), a software development company based in France and with team members all over the world. Want to know more about us? [Check out our blog!](https://blog.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=contact)
 🗞 [Subscribe to our newsletter](https://meilisearch.us2.list-manage.com/subscribe?u=27870f7b71c908a8b359599fb&id=79582d828e) if you don't want to miss any updates! We promise we won't clutter your mailbox: we only send one edition every two months.

View File

@@ -780,7 +780,7 @@ expression: document
           1.3484878540039063
         ]
       ],
-      "regenerate": true
+      "userProvided": false
     }
   }
 }

View File

@@ -779,7 +779,7 @@ expression: document
           1.04031240940094
         ]
       ],
-      "regenerate": true
+      "userProvided": false
     }
   }
 }

View File

@@ -152,7 +152,6 @@ impl Settings<Unchecked> {
 }
 #[derive(Debug, Clone, Deserialize)]
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[cfg_attr(test, derive(serde::Serialize))]
 #[serde(deny_unknown_fields)]
 #[serde(rename_all = "camelCase")]

View File

@@ -182,7 +182,6 @@ impl Settings<Unchecked> {
     }
 }
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[derive(Debug, Clone, Deserialize)]
 #[cfg_attr(test, derive(serde::Serialize))]
 #[serde(deny_unknown_fields)]

View File

@@ -200,7 +200,6 @@ impl std::ops::Deref for IndexUid {
     }
 }
-#[allow(dead_code)] // otherwise rustc complains that the fields go unused
 #[derive(Debug)]
 #[cfg_attr(test, derive(serde::Serialize))]
 #[cfg_attr(test, serde(rename_all = "camelCase"))]

View File

@ -40,9 +40,7 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] } uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies] [dev-dependencies]
arroy = "0.4.0"
big_s = "1.0.2" big_s = "1.0.2"
crossbeam = "0.8.4" crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] } insta = { version = "1.34.0", features = ["json", "redactions"] }
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" } meili-snap = { path = "../meili-snap" }

View File

@@ -909,7 +909,6 @@ impl IndexScheduler {
         let fields_ids_map = index.fields_ids_map(&rtxn)?;
         let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
-        let embedding_configs = index.embedding_configs(&rtxn)?;
         // 3.1. Dump the documents
         for ret in index.all_documents(&rtxn)? {
@@ -952,21 +951,16 @@
             };
             for (embedder_name, embeddings) in embeddings {
-                let user_provided = embedding_configs
-                    .iter()
-                    .find(|conf| conf.name == embedder_name)
-                    .is_some_and(|conf| conf.user_provided.contains(id));
-                let embeddings = ExplicitVectors {
-                    embeddings: Some(
-                        VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
-                    ),
-                    regenerate: !user_provided,
-                };
-                vectors.insert(
-                    embedder_name,
-                    serde_json::to_value(embeddings).unwrap(),
-                );
+                // don't change the entry if it already exists, because it was user-provided
+                vectors.entry(embedder_name).or_insert_with(|| {
+                    let embeddings = ExplicitVectors {
+                        embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
+                            embeddings,
+                        ),
+                        user_provided: false,
+                    };
+                    serde_json::to_value(embeddings).unwrap()
+                });
             }
         }

View File

@@ -53,7 +53,6 @@ use meilisearch_types::heed::byteorder::BE;
 use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
 use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
 use meilisearch_types::milli::documents::DocumentsBatchBuilder;
-use meilisearch_types::milli::index::IndexEmbeddingConfig;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
@@ -1460,39 +1459,33 @@ impl IndexScheduler {
     // TODO: consider using a type alias or a struct embedder/template
     pub fn embedders(
         &self,
-        embedding_configs: Vec<IndexEmbeddingConfig>,
+        embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
     ) -> Result<EmbeddingConfigs> {
         let res: Result<_> = embedding_configs
             .into_iter()
-            .map(
-                |IndexEmbeddingConfig {
-                     name,
-                     config: milli::vector::EmbeddingConfig { embedder_options, prompt },
-                     ..
-                 }| {
-                    let prompt =
-                        Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
-                    // optimistically return existing embedder
-                    {
-                        let embedders = self.embedders.read().unwrap();
-                        if let Some(embedder) = embedders.get(&embedder_options) {
-                            return Ok((name, (embedder.clone(), prompt)));
-                        }
-                    }
-                    // add missing embedder
-                    let embedder = Arc::new(
-                        Embedder::new(embedder_options.clone())
-                            .map_err(meilisearch_types::milli::vector::Error::from)
-                            .map_err(meilisearch_types::milli::Error::from)?,
-                    );
-                    {
-                        let mut embedders = self.embedders.write().unwrap();
-                        embedders.insert(embedder_options, embedder.clone());
-                    }
-                    Ok((name, (embedder, prompt)))
-                },
-            )
+            .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
+                let prompt =
+                    Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
+                // optimistically return existing embedder
+                {
+                    let embedders = self.embedders.read().unwrap();
+                    if let Some(embedder) = embedders.get(&embedder_options) {
+                        return Ok((name, (embedder.clone(), prompt)));
+                    }
+                }
+                // add missing embedder
+                let embedder = Arc::new(
+                    Embedder::new(embedder_options.clone())
+                        .map_err(meilisearch_types::milli::vector::Error::from)
+                        .map_err(meilisearch_types::milli::Error::from)?,
+                );
+                {
+                    let mut embedders = self.embedders.write().unwrap();
+                    embedders.insert(embedder_options, embedder.clone());
+                }
+                Ok((name, (embedder, prompt)))
+            })
             .collect();
         res.map(EmbeddingConfigs::new)
     }
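Both sides of the hunk above share the same embedder-caching strategy: look the embedder up under a cheap read lock first, and only take the write lock to build and publish one that is missing. A minimal standalone sketch of that pattern follows; the names `EmbedderCache`, `EmbedderOptions`, and `Embedder` are illustrative stand-ins for this example, not the actual meilisearch types.

    use std::collections::HashMap;
    use std::sync::{Arc, RwLock};

    // Illustrative stand-ins for the real embedder option and embedder types.
    #[derive(Clone, PartialEq, Eq, Hash)]
    struct EmbedderOptions(String);
    struct Embedder;

    struct EmbedderCache {
        embedders: RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>,
    }

    impl EmbedderCache {
        fn get_or_create(&self, options: EmbedderOptions) -> Arc<Embedder> {
            // Optimistically return an existing embedder under a read lock.
            {
                let embedders = self.embedders.read().unwrap();
                if let Some(embedder) = embedders.get(&options) {
                    return embedder.clone();
                }
            }
            // Miss: build the embedder, then publish it under the write lock.
            // `entry` keeps a value that another thread may have inserted meanwhile.
            let embedder = Arc::new(Embedder);
            let mut embedders = self.embedders.write().unwrap();
            embedders.entry(options).or_insert(embedder).clone()
        }
    }

    fn main() {
        let cache = EmbedderCache { embedders: RwLock::new(HashMap::new()) };
        let first = cache.get_or_create(EmbedderOptions("default".into()));
        let second = cache.get_or_create(EmbedderOptions("default".into()));
        // The second call reuses the cached embedder instead of rebuilding it.
        assert!(Arc::ptr_eq(&first, &second));
    }

The read-then-write split keeps the common path (embedder already built) free of write-lock contention, at the cost of a second lookup on a miss.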
@@ -1755,9 +1748,6 @@ mod tests {
     use meilisearch_types::milli::update::IndexDocumentsMethod::{
         ReplaceDocuments, UpdateDocuments,
     };
-    use meilisearch_types::milli::update::Setting;
-    use meilisearch_types::milli::vector::settings::EmbeddingSettings;
-    use meilisearch_types::settings::Unchecked;
     use meilisearch_types::tasks::IndexSwap;
     use meilisearch_types::VERSION_FILE_NAME;
     use tempfile::{NamedTempFile, TempDir};
@@ -1836,7 +1826,6 @@ mod tests {
         assert_eq!(breakpoint, (Init, false));
         let index_scheduler_handle = IndexSchedulerHandle {
             _tempdir: tempdir,
-            index_scheduler: index_scheduler.private_clone(),
             test_breakpoint_rcv: receiver,
             last_breakpoint: breakpoint.0,
         };
@@ -1925,7 +1914,6 @@ mod tests {
     pub struct IndexSchedulerHandle {
         _tempdir: TempDir,
-        index_scheduler: IndexScheduler,
         test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>,
         last_breakpoint: Breakpoint,
     }
@@ -1943,13 +1931,9 @@
         {
             Ok(b) => b,
             Err(RecvTimeoutError::Timeout) => {
-                let state = snapshot_index_scheduler(&self.index_scheduler);
-                panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
-            }
-            Err(RecvTimeoutError::Disconnected) => {
-                let state = snapshot_index_scheduler(&self.index_scheduler);
-                panic!("The scheduler crashed.\n{state}")
+                panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.")
             }
+            Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
         };
         // if we've already encountered a breakpoint we're supposed to be stuck on the false
         // and we expect the same variant with the true to come now.
@@ -1968,13 +1952,9 @@
         {
             Ok(b) => b,
             Err(RecvTimeoutError::Timeout) => {
-                let state = snapshot_index_scheduler(&self.index_scheduler);
-                panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
-            }
-            Err(RecvTimeoutError::Disconnected) => {
-                let state = snapshot_index_scheduler(&self.index_scheduler);
-                panic!("The scheduler crashed.\n{state}")
+                panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.")
             }
+            Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
         };
         assert!(!b, "Found the breakpoint handle in a bad state. Check your test suite");
@@ -1988,10 +1968,9 @@
     fn advance_till(&mut self, breakpoints: impl IntoIterator<Item = Breakpoint>) {
         for breakpoint in breakpoints {
             let b = self.advance();
-            let state = snapshot_index_scheduler(&self.index_scheduler);
             assert_eq!(
                 b, breakpoint,
-                "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}",
+                "Was expecting the breakpoint `{:?}` but instead got `{:?}`.",
                 breakpoint, b
             );
         }
@@ -2016,7 +1995,6 @@ mod tests {
     // Wait for one successful batch.
     #[track_caller]
     fn advance_one_successful_batch(&mut self) {
-        self.index_scheduler.assert_internally_consistent();
         self.advance_till([Start, BatchCreated]);
         loop {
             match self.advance() {
@@ -2025,17 +2003,13 @@ mod tests {
                 InsideProcessBatch => (),
                 // the batch went successfully, we can stop the loop and go on with the next states.
                 ProcessBatchSucceeded => break,
-                AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
-                ProcessBatchFailed => {
-                    while self.advance() != Start {}
-                    panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler))
-                },
+                AbortedIndexation => panic!("The batch was aborted."),
+                ProcessBatchFailed => panic!("The batch failed."),
                 breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
             }
         }
         self.advance_till([AfterProcessing]);
-        self.index_scheduler.assert_internally_consistent();
     }
     // Wait for one failed batch.
@@ -2049,8 +2023,8 @@
                 InsideProcessBatch => (),
                 // the batch went failed, we can stop the loop and go on with the next states.
                 ProcessBatchFailed => break,
-                ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)),
-                AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
+                ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"),
+                AbortedIndexation => panic!("The batch was aborted."),
                 breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
             }
         }
@@ -3078,10 +3052,8 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let configs = index.embedding_configs(&rtxn).unwrap();
-        let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap();
-        insta::assert_snapshot!(name, @"default");
-        insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
-        insta::assert_json_snapshot!(config.embedder_options);
+        let (_, embedding_config) = configs.first().unwrap();
+        insta::assert_json_snapshot!(embedding_config.embedder_options);
     }
     #[test]
@@ -5017,6 +4989,7 @@ mod tests {
                 false,
             )
             .unwrap();
+        index_scheduler.assert_internally_consistent();
         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors");
@@ -5027,7 +5000,7 @@ mod tests {
             insta::assert_json_snapshot!(task.details);
         }
-        handle.advance_one_successful_batch();
+        handle.advance_n_successful_batches(1);
         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors");
         {
@@ -5044,17 +5017,13 @@ mod tests {
             let configs = index.embedding_configs(&rtxn).unwrap();
             // for consistency with the below
             #[allow(clippy::get_first)]
-            let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } =
-                configs.get(0).unwrap();
-            insta::assert_snapshot!(name, @"A_fakerest");
-            insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
+            let (name, fakerest_config) = configs.get(0).unwrap();
+            insta::assert_json_snapshot!(name, @r###""A_fakerest""###);
             insta::assert_json_snapshot!(fakerest_config.embedder_options);
             let fakerest_name = name.clone();
-            let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } =
-                configs.get(1).unwrap();
-            insta::assert_snapshot!(name, @"B_small_hf");
-            insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
+            let (name, simple_hf_config) = configs.get(1).unwrap();
+            insta::assert_json_snapshot!(name, @r###""B_small_hf""###);
             insta::assert_json_snapshot!(simple_hf_config.embedder_options);
             let simple_hf_name = name.clone();
@@ -5069,25 +5038,25 @@ mod tests {
         // add one doc, specifying vectors
         let doc = serde_json::json!(
             {
                 "id": 0,
                 "doggo": "Intel",
                 "breed": "beagle",
                 "_vectors": {
                     &fakerest_name: {
                         // this will never trigger regeneration, which is good because we can't actually generate with
                         // this embedder
-                        "regenerate": false,
+                        "userProvided": true,
                         "embeddings": beagle_embed,
                     },
                     &simple_hf_name: {
                         // this will be regenerated on updates
-                        "regenerate": true,
+                        "userProvided": false,
                         "embeddings": lab_embed,
                     },
                     "noise": [0.1, 0.2, 0.3]
                 }
             }
         );
         let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap();
@@ -5109,6 +5078,7 @@ mod tests {
                 false,
             )
             .unwrap();
+        index_scheduler.assert_internally_consistent();
         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel");
@@ -5121,19 +5091,6 @@ mod tests {
         let index = index_scheduler.index("doggos").unwrap();
         let rtxn = index.read_txn().unwrap();
-        // Ensure the document have been inserted into the relevant bitamp
-        let configs = index.embedding_configs(&rtxn).unwrap();
-        // for consistency with the below
-        #[allow(clippy::get_first)]
-        let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
-            configs.get(0).unwrap();
-        insta::assert_snapshot!(name, @"A_fakerest");
-        insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
-        let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap();
-        insta::assert_snapshot!(name, @"B_small_hf");
-        insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
         let embeddings = index.embeddings(&rtxn, 0).unwrap();
         assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
@@ -5183,6 +5140,7 @@ mod tests {
                 false,
            )
            .unwrap();
+        index_scheduler.assert_internally_consistent();
         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir");
@@ -5195,25 +5153,11 @@ mod tests {
         let index = index_scheduler.index("doggos").unwrap();
         let rtxn = index.read_txn().unwrap();
-        // Ensure the document have been inserted into the relevant bitamp
-        let configs = index.embedding_configs(&rtxn).unwrap();
-        // for consistency with the below
-        #[allow(clippy::get_first)]
-        let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
-            configs.get(0).unwrap();
-        insta::assert_snapshot!(name, @"A_fakerest");
-        insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
-        let IndexEmbeddingConfig { name, config: _, user_provided } =
-            configs.get(1).unwrap();
-        insta::assert_snapshot!(name, @"B_small_hf");
-        insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
         let embeddings = index.embeddings(&rtxn, 0).unwrap();
-        // automatically changed to patou because set to regenerate
+        // automatically changed to patou
         assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true");
-        // remained beagle
+        // remained beagle because set to userProvided
         assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true");
         let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
@@ -5232,578 +5176,4 @@ mod tests {
         }
     }
 }
#[test]
fn import_vectors_first_and_embedder_later() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let content = serde_json::json!(
[
{
"id": 0,
"doggo": "kefir",
},
{
"id": 1,
"doggo": "intel",
"_vectors": {
"my_doggo_embedder": vec![1; 384],
"unknown embedder": vec![1, 2, 3],
}
},
{
"id": 2,
"doggo": "max",
"_vectors": {
"my_doggo_embedder": {
"regenerate": false,
"embeddings": vec![2; 384],
},
"unknown embedder": vec![4, 5],
},
},
{
"id": 3,
"doggo": "marcel",
"_vectors": {
"my_doggo_embedder": {
"regenerate": true,
"embeddings": vec![3; 384],
},
},
},
{
"id": 4,
"doggo": "sora",
"_vectors": {
"my_doggo_embedder": {
"regenerate": true,
},
},
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"5");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
document_template: Setting::Set(S("{{doc.doggo}}")),
..Default::default()
})
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: false,
},
None,
false,
)
.unwrap();
index_scheduler.assert_internally_consistent();
handle.advance_one_successful_batch();
index_scheduler.assert_internally_consistent();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
// the all the vectors linked to the new specified embedder have been removed
// Only the unknown embedders stays in the document DB
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);
let conf = index.embedding_configs(&rtxn).unwrap();
// even though we specified the vector for the ID 3, it shouldn't be marked
// as user provided since we explicitely marked it as NOT user provided.
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "my_doggo_embedder",
config: EmbeddingConfig {
embedder_options: HuggingFace(
EmbedderOptions {
model: "sentence-transformers/all-MiniLM-L6-v2",
revision: Some(
"e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
),
distribution: None,
},
),
prompt: PromptData {
template: "{{doc.doggo}}",
},
},
user_provided: RoaringBitmap<[1, 2]>,
},
]
"###);
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty(), "{embedding:?}");
// the document with the id 3 should keep its original embedding
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
let mut embeddings = Vec::new();
'vectors: for i in 0..=u8::MAX {
let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None),
e => Err(e),
})
.transpose();
let Some(reader) = reader else {
break 'vectors;
};
let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
if let Some(embedding) = embedding {
embeddings.push(embedding)
} else {
break 'vectors;
}
}
snapshot!(embeddings.len(), @"1");
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
// If we update marcel it should regenerate its embedding automatically
let content = serde_json::json!(
[
{
"id": 3,
"doggo": "marvel",
},
{
"id": 4,
"doggo": "sorry",
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"2");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: UpdateDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
// the document with the id 3 should have its original embedding updated
let rtxn = index.read_txn().unwrap();
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
let doc = index.documents(&rtxn, Some(docid)).unwrap()[0];
let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap();
snapshot!(json_string!(doc), @r###"
{
"id": 3,
"doggo": "marvel"
}
"###);
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty());
assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
// the document with the id 4 should generate an embedding
let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty());
}
#[test]
fn delete_document_containing_vector() {
// 1. Add an embedder
// 2. Push two documents containing a simple vector
// 3. Delete the first document
// 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore
// 5. Clear the index
// 6. The user defined roaring bitmap shouldn't contains the id of the second document
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("manual") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
dimensions: Setting::Set(3),
..Default::default()
})
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let content = serde_json::json!(
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"manual": vec![0, 0, 0],
}
},
{
"id": 1,
"doggo": "intel",
"_vectors": {
"manual": vec![1, 1, 1],
}
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"2");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: false,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
index_scheduler
.register(
KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1")],
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###);
let conf = index.embedding_configs(&rtxn).unwrap();
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "manual",
config: EmbeddingConfig {
embedder_options: UserProvided(
EmbedderOptions {
dimensions: 3,
distribution: None,
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
},
},
user_provided: RoaringBitmap<[0]>,
},
]
"###);
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["manual"];
assert!(!embedding.is_empty(), "{embedding:?}");
index_scheduler
.register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @"[]");
let conf = index.embedding_configs(&rtxn).unwrap();
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "manual",
config: EmbeddingConfig {
embedder_options: UserProvided(
EmbedderOptions {
dimensions: 3,
distribution: None,
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
},
},
user_provided: RoaringBitmap<[]>,
},
]
"###);
}
#[test]
fn delete_embedder_with_user_provided_vectors() {
// 1. Add two embedders
// 2. Push two documents containing a simple vector
// 3. The documents must not contain the vectors after the update as they are in the vectors db
// 3. Delete the embedders
// 4. The documents contain the vectors again
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("manual") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
dimensions: Setting::Set(3),
..Default::default()
}),
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
document_template: Setting::Set(S("{{doc.doggo}}")),
..Default::default()
}),
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let content = serde_json::json!(
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"manual": vec![0, 0, 0],
"my_doggo_embedder": vec![1; 384],
}
},
{
"id": 1,
"doggo": "intel",
"_vectors": {
"manual": vec![1, 1, 1],
}
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"2");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: false,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###);
}
{
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("manual") => Setting::Reset,
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
}
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###);
}
{
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Reset,
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
}
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
// FIXME: redaction
snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###);
}
}
}

View File

@@ -6,6 +6,10 @@ expression: doc
   "doggo": "kefir",
   "breed": "patou",
   "_vectors": {
+    "A_fakerest": {
+      "embeddings": "[vector]",
+      "userProvided": true
+    },
     "noise": [
       0.1,
       0.2,

View File

@@ -6,6 +6,10 @@ expression: doc
   "doggo": "Intel",
   "breed": "beagle",
   "_vectors": {
+    "A_fakerest": {
+      "embeddings": "[vector]",
+      "userProvided": true
+    },
     "noise": [
       0.1,
       0.2,

View File

@@ -11,7 +11,7 @@ edition.workspace = true
 license.workspace = true
 [dependencies]
-actix-web = { version = "4.6.0", default-features = false }
+actix-web = { version = "4.5.1", default-features = false }
 anyhow = "1.0.79"
 convert_case = "0.6.0"
 csv = "1.3.0"
@@ -30,12 +30,7 @@ serde_json = "1.0.111"
 tar = "0.4.40"
 tempfile = "3.9.0"
 thiserror = "1.0.56"
-time = { version = "0.3.31", features = [
-    "serde-well-known",
-    "formatting",
-    "parsing",
-    "macros",
-] }
+time = { version = "0.3.31", features = ["serde-well-known", "formatting", "parsing", "macros"] }
 tokio = "1.35"
 uuid = { version = "1.6.1", features = ["serde", "v4"] }

View File

@@ -189,6 +189,4 @@ merge_with_error_impl_take_error_message!(ParseTaskKindError);
 merge_with_error_impl_take_error_message!(ParseTaskStatusError);
 merge_with_error_impl_take_error_message!(IndexUidFormatError);
 merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio);
-merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold);
-merge_with_error_impl_take_error_message!(InvalidSimilarRankingScoreThreshold);
 merge_with_error_impl_take_error_message!(InvalidSimilarId);

View File

@@ -222,7 +222,6 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
 InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
 InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
-InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
@@ -241,11 +240,7 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
-InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
-InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
-InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
-InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ;
 InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
 InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
 InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;
@@ -273,14 +268,13 @@ InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ;
 InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
 InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
-InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;
@@ -385,7 +379,6 @@ impl ErrorCode for milli::Error {
                     Code::IndexPrimaryKeyMultipleCandidatesFound
                 }
                 UserError::PrimaryKeyCannotBeChanged(_) => Code::IndexPrimaryKeyAlreadyExists,
-                UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct,
                 UserError::SortRankingRuleMissing => Code::InvalidSearchSort,
                 UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets,
                 UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,
@@ -398,8 +391,7 @@ impl ErrorCode for milli::Error {
                 UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
                 UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
                 UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
-                UserError::InvalidVectorsMapType { .. }
-                | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
+                UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
                 UserError::TooManyVectors(_, _) => Code::TooManyVectors,
                 UserError::SortError(_) => Code::InvalidSearchSort,
                 UserError::InvalidMinTypoWordLenSetting(_, _) => {
@@ -513,21 +505,6 @@ impl fmt::Display for deserr_codes::InvalidSimilarId {
     }
 }
-impl fmt::Display for deserr_codes::InvalidSearchRankingScoreThreshold {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`."
-        )
-    }
-}
-impl fmt::Display for deserr_codes::InvalidSimilarRankingScoreThreshold {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        deserr_codes::InvalidSearchRankingScoreThreshold.fmt(f)
-    }
-}
 #[macro_export]
 macro_rules! internal_error {
     ($target:ty : $($other:path), *) => {

View File

@@ -8,7 +8,6 @@ use std::str::FromStr;
 use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
 use fst::IntoStreamer;
-use milli::index::IndexEmbeddingConfig;
 use milli::proximity::ProximityPrecision;
 use milli::update::Setting;
 use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
@@ -673,7 +672,7 @@ pub fn settings(
     let embedders: BTreeMap<_, _> = index
         .embedding_configs(rtxn)?
         .into_iter()
-        .map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into())))
+        .map(|(name, config)| (name, Setting::Set(config.into())))
         .collect();
     let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

View File

@ -14,20 +14,20 @@ default-run = "meilisearch"
[dependencies] [dependencies]
actix-cors = "0.7.0" actix-cors = "0.7.0"
actix-http = { version = "3.7.0", default-features = false, features = [ actix-http = { version = "3.6.0", default-features = false, features = [
"compress-brotli", "compress-brotli",
"compress-gzip", "compress-gzip",
"rustls-0_21", "rustls-0_21",
] } ] }
actix-utils = "3.0.1" actix-utils = "3.0.1"
actix-web = { version = "4.6.0", default-features = false, features = [ actix-web = { version = "4.5.1", default-features = false, features = [
"macros", "macros",
"compress-brotli", "compress-brotli",
"compress-gzip", "compress-gzip",
"cookies", "cookies",
"rustls-0_21", "rustls-0_21",
] } ] }
actix-web-static-files = { version = "4.0.1", optional = true } actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true }
anyhow = { version = "1.0.79", features = ["backtrace"] } anyhow = { version = "1.0.79", features = ["backtrace"] }
async-stream = "0.3.5" async-stream = "0.3.5"
async-trait = "0.1.77" async-trait = "0.1.77"
@ -105,20 +105,19 @@ url = { version = "2.5.0", features = ["serde"] }
tracing = "0.1.40" tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-subscriber = { version = "0.3.18", features = ["json"] }
tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
tracing-actix-web = "0.7.10" tracing-actix-web = "0.7.9"
build-info = { version = "1.7.0", path = "../build-info" } build-info = { version = "1.7.0", path = "../build-info" }
[dev-dependencies] [dev-dependencies]
actix-rt = "2.9.0" actix-rt = "2.9.0"
assert-json-diff = "2.0.2" assert-json-diff = "2.0.2"
brotli = "6.0.0" brotli = "3.4.0"
insta = "1.34.0" insta = "1.34.0"
manifest-dir-macros = "0.1.18" manifest-dir-macros = "0.1.18"
maplit = "1.0.2" maplit = "1.0.2"
meili-snap = { path = "../meili-snap" } meili-snap = { path = "../meili-snap" }
temp-env = "0.3.6" temp-env = "0.3.6"
urlencoding = "2.1.3" urlencoding = "2.1.3"
wiremock = "0.6.0"
yaup = "0.2.1" yaup = "0.2.1"
[build-dependencies] [build-dependencies]
@ -159,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard] [package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"

View File

@ -74,8 +74,8 @@ pub enum DocumentDeletionKind {
#[derive(Copy, Clone, Debug, PartialEq, Eq)] #[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum DocumentFetchKind { pub enum DocumentFetchKind {
PerDocumentId { retrieve_vectors: bool }, PerDocumentId,
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, Normal { with_filter: bool, limit: usize, offset: usize },
} }
pub trait Analytics: Sync + Send { pub trait Analytics: Sync + Send {
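Editor's note: because `PerDocumentId` is now a struct variant, every `match` on `DocumentFetchKind` must bind or skip the new `retrieve_vectors` field. A minimal sketch, redeclaring the enum locally and mirroring the reduction the analytics code performs later in this diff:

// Local redeclaration of the enum shape shown above.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum DocumentFetchKind {
    PerDocumentId { retrieve_vectors: bool },
    Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
}

/// Collapse a fetch description into (limit, offset, retrieve_vectors),
/// the same reduction `DocumentsFetchAggregator::from_query` performs.
fn fetch_stats(query: &DocumentFetchKind) -> (usize, usize, bool) {
    match query {
        DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
        DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
            (*limit, *offset, *retrieve_vectors)
        }
    }
}

fn main() {
    let query = DocumentFetchKind::Normal {
        with_filter: true,
        limit: 20,
        offset: 40,
        retrieve_vectors: false,
    };
    assert_eq!(fetch_stats(&query), (20, 40, false));
}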

View File

@ -597,9 +597,6 @@ pub struct SearchAggregator {
// every time a request has a filter, this field must be incremented by one // every time a request has a filter, this field must be incremented by one
sort_total_number_of_criteria: usize, sort_total_number_of_criteria: usize,
// distinct
distinct: bool,
// filter // filter
filter_with_geo_radius: bool, filter_with_geo_radius: bool,
filter_with_geo_bounding_box: bool, filter_with_geo_bounding_box: bool,
@ -625,7 +622,6 @@ pub struct SearchAggregator {
// Whether a non-default embedder was specified // Whether a non-default embedder was specified
embedder: bool, embedder: bool,
hybrid: bool, hybrid: bool,
retrieve_vectors: bool,
// every time a search is done, we increment the counter linked to the used settings // every time a search is done, we increment the counter linked to the used settings
matching_strategy: HashMap<String, usize>, matching_strategy: HashMap<String, usize>,
@ -652,7 +648,6 @@ pub struct SearchAggregator {
// scoring // scoring
show_ranking_score: bool, show_ranking_score: bool,
show_ranking_score_details: bool, show_ranking_score_details: bool,
ranking_score_threshold: bool,
} }
impl SearchAggregator { impl SearchAggregator {
@ -666,7 +661,6 @@ impl SearchAggregator {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors,
attributes_to_crop: _, attributes_to_crop: _,
crop_length, crop_length,
attributes_to_highlight: _, attributes_to_highlight: _,
@ -675,7 +669,6 @@ impl SearchAggregator {
show_ranking_score_details, show_ranking_score_details,
filter, filter,
sort, sort,
distinct,
facets: _, facets: _,
highlight_pre_tag, highlight_pre_tag,
highlight_post_tag, highlight_post_tag,
@ -683,7 +676,6 @@ impl SearchAggregator {
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
} = query; } = query;
let mut ret = Self::default(); let mut ret = Self::default();
@ -698,8 +690,6 @@ impl SearchAggregator {
ret.sort_sum_of_criteria_terms = sort.len(); ret.sort_sum_of_criteria_terms = sort.len();
} }
ret.distinct = distinct.is_some();
if let Some(ref filter) = filter { if let Some(ref filter) = filter {
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap()); static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
ret.filter_total_number_of_criteria = 1; ret.filter_total_number_of_criteria = 1;
@ -736,7 +726,6 @@ impl SearchAggregator {
if let Some(ref vector) = vector { if let Some(ref vector) = vector {
ret.max_vector_size = vector.len(); ret.max_vector_size = vector.len();
} }
ret.retrieve_vectors |= retrieve_vectors;
if query.is_finite_pagination() { if query.is_finite_pagination() {
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
@ -759,7 +748,6 @@ impl SearchAggregator {
ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score = *show_ranking_score;
ret.show_ranking_score_details = *show_ranking_score_details; ret.show_ranking_score_details = *show_ranking_score_details;
ret.ranking_score_threshold = ranking_score_threshold.is_some();
if let Some(hybrid) = hybrid { if let Some(hybrid) = hybrid {
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
@ -804,7 +792,6 @@ impl SearchAggregator {
sort_with_geo_point, sort_with_geo_point,
sort_sum_of_criteria_terms, sort_sum_of_criteria_terms,
sort_total_number_of_criteria, sort_total_number_of_criteria,
distinct,
filter_with_geo_radius, filter_with_geo_radius,
filter_with_geo_bounding_box, filter_with_geo_bounding_box,
filter_sum_of_criteria_terms, filter_sum_of_criteria_terms,
@ -813,7 +800,6 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses, attributes_to_search_on_total_number_of_uses,
max_terms_number, max_terms_number,
max_vector_size, max_vector_size,
retrieve_vectors,
matching_strategy, matching_strategy,
max_limit, max_limit,
max_offset, max_offset,
@ -835,7 +821,6 @@ impl SearchAggregator {
hybrid, hybrid,
total_degraded, total_degraded,
total_used_negative_operator, total_used_negative_operator,
ranking_score_threshold,
} = other; } = other;
if self.timestamp.is_none() { if self.timestamp.is_none() {
@ -862,9 +847,6 @@ impl SearchAggregator {
self.sort_total_number_of_criteria = self.sort_total_number_of_criteria =
self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
// distinct
self.distinct |= distinct;
// filter // filter
self.filter_with_geo_radius |= filter_with_geo_radius; self.filter_with_geo_radius |= filter_with_geo_radius;
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
@ -887,7 +869,6 @@ impl SearchAggregator {
// vector // vector
self.max_vector_size = self.max_vector_size.max(max_vector_size); self.max_vector_size = self.max_vector_size.max(max_vector_size);
self.retrieve_vectors |= retrieve_vectors;
self.semantic_ratio |= semantic_ratio; self.semantic_ratio |= semantic_ratio;
self.hybrid |= hybrid; self.hybrid |= hybrid;
self.embedder |= embedder; self.embedder |= embedder;
@ -923,7 +904,6 @@ impl SearchAggregator {
// scoring // scoring
self.show_ranking_score |= show_ranking_score; self.show_ranking_score |= show_ranking_score;
self.show_ranking_score_details |= show_ranking_score_details; self.show_ranking_score_details |= show_ranking_score_details;
self.ranking_score_threshold |= ranking_score_threshold;
} }
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@ -936,7 +916,6 @@ impl SearchAggregator {
sort_with_geo_point, sort_with_geo_point,
sort_sum_of_criteria_terms, sort_sum_of_criteria_terms,
sort_total_number_of_criteria, sort_total_number_of_criteria,
distinct,
filter_with_geo_radius, filter_with_geo_radius,
filter_with_geo_bounding_box, filter_with_geo_bounding_box,
filter_sum_of_criteria_terms, filter_sum_of_criteria_terms,
@ -945,7 +924,6 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses, attributes_to_search_on_total_number_of_uses,
max_terms_number, max_terms_number,
max_vector_size, max_vector_size,
retrieve_vectors,
matching_strategy, matching_strategy,
max_limit, max_limit,
max_offset, max_offset,
@ -967,7 +945,6 @@ impl SearchAggregator {
hybrid, hybrid,
total_degraded, total_degraded,
total_used_negative_operator, total_used_negative_operator,
ranking_score_threshold,
} = self; } = self;
if total_received == 0 { if total_received == 0 {
@ -994,7 +971,6 @@ impl SearchAggregator {
"with_geoPoint": sort_with_geo_point, "with_geoPoint": sort_with_geo_point,
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
}, },
"distinct": distinct,
"filter": { "filter": {
"with_geoRadius": filter_with_geo_radius, "with_geoRadius": filter_with_geo_radius,
"with_geoBoundingBox": filter_with_geo_bounding_box, "with_geoBoundingBox": filter_with_geo_bounding_box,
@ -1009,7 +985,6 @@ impl SearchAggregator {
}, },
"vector": { "vector": {
"max_vector_size": max_vector_size, "max_vector_size": max_vector_size,
"retrieve_vectors": retrieve_vectors,
}, },
"hybrid": { "hybrid": {
"enabled": hybrid, "enabled": hybrid,
@ -1040,7 +1015,6 @@ impl SearchAggregator {
"scoring": { "scoring": {
"show_ranking_score": show_ranking_score, "show_ranking_score": show_ranking_score,
"show_ranking_score_details": show_ranking_score_details, "show_ranking_score_details": show_ranking_score_details,
"ranking_score_threshold": ranking_score_threshold,
}, },
}); });
@ -1098,7 +1072,6 @@ impl MultiSearchAggregator {
page: _, page: _,
hits_per_page: _, hits_per_page: _,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors: _,
attributes_to_crop: _, attributes_to_crop: _,
crop_length: _, crop_length: _,
attributes_to_highlight: _, attributes_to_highlight: _,
@ -1107,7 +1080,6 @@ impl MultiSearchAggregator {
show_matches_position: _, show_matches_position: _,
filter: _, filter: _,
sort: _, sort: _,
distinct: _,
facets: _, facets: _,
highlight_pre_tag: _, highlight_pre_tag: _,
highlight_post_tag: _, highlight_post_tag: _,
@ -1115,7 +1087,6 @@ impl MultiSearchAggregator {
matching_strategy: _, matching_strategy: _,
attributes_to_search_on: _, attributes_to_search_on: _,
hybrid: _, hybrid: _,
ranking_score_threshold: _,
} = query; } = query;
index_uid.as_str() index_uid.as_str()
@ -1263,7 +1234,6 @@ impl FacetSearchAggregator {
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
} = query; } = query;
let mut ret = Self::default(); let mut ret = Self::default();
@ -1278,8 +1248,7 @@ impl FacetSearchAggregator {
|| filter.is_some() || filter.is_some()
|| *matching_strategy != MatchingStrategy::default() || *matching_strategy != MatchingStrategy::default()
|| attributes_to_search_on.is_some() || attributes_to_search_on.is_some()
|| hybrid.is_some() || hybrid.is_some();
|| ranking_score_threshold.is_some();
ret ret
} }
@ -1555,9 +1524,6 @@ pub struct DocumentsFetchAggregator {
// if a filter was used // if a filter was used
per_filter: bool, per_filter: bool,
#[serde(rename = "vector.retrieve_vectors")]
retrieve_vectors: bool,
// pagination // pagination
#[serde(rename = "pagination.max_limit")] #[serde(rename = "pagination.max_limit")]
max_limit: usize, max_limit: usize,
@ -1567,21 +1533,18 @@ pub struct DocumentsFetchAggregator {
impl DocumentsFetchAggregator { impl DocumentsFetchAggregator {
pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
let (limit, offset, retrieve_vectors) = match query { let (limit, offset) = match query {
DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), DocumentFetchKind::PerDocumentId => (1, 0),
DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset),
(*limit, *offset, *retrieve_vectors)
}
}; };
Self { Self {
timestamp: Some(OffsetDateTime::now_utc()), timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(), user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1, total_received: 1,
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_document_id: matches!(query, DocumentFetchKind::PerDocumentId),
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
max_limit: limit, max_limit: limit,
max_offset: offset, max_offset: offset,
retrieve_vectors,
} }
} }
@ -1595,7 +1558,6 @@ impl DocumentsFetchAggregator {
per_filter, per_filter,
max_limit, max_limit,
max_offset, max_offset,
retrieve_vectors,
} = other; } = other;
if self.timestamp.is_none() { if self.timestamp.is_none() {
@ -1611,8 +1573,6 @@ impl DocumentsFetchAggregator {
self.max_limit = self.max_limit.max(max_limit); self.max_limit = self.max_limit.max(max_limit);
self.max_offset = self.max_offset.max(max_offset); self.max_offset = self.max_offset.max(max_offset);
self.retrieve_vectors |= retrieve_vectors;
} }
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@ -1653,7 +1613,6 @@ pub struct SimilarAggregator {
// Whether a non-default embedder was specified // Whether a non-default embedder was specified
embedder: bool, embedder: bool,
retrieve_vectors: bool,
// pagination // pagination
max_limit: usize, max_limit: usize,
@ -1665,7 +1624,6 @@ pub struct SimilarAggregator {
// scoring // scoring
show_ranking_score: bool, show_ranking_score: bool,
show_ranking_score_details: bool, show_ranking_score_details: bool,
ranking_score_threshold: bool,
} }
impl SimilarAggregator { impl SimilarAggregator {
@ -1677,11 +1635,9 @@ impl SimilarAggregator {
offset, offset,
limit, limit,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
filter, filter,
ranking_score_threshold,
} = query; } = query;
let mut ret = Self::default(); let mut ret = Self::default();
@ -1719,10 +1675,8 @@ impl SimilarAggregator {
ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score = *show_ranking_score;
ret.show_ranking_score_details = *show_ranking_score_details; ret.show_ranking_score_details = *show_ranking_score_details;
ret.ranking_score_threshold = ranking_score_threshold.is_some();
ret.embedder = embedder.is_some(); ret.embedder = embedder.is_some();
ret.retrieve_vectors = *retrieve_vectors;
ret ret
} }
@ -1754,8 +1708,6 @@ impl SimilarAggregator {
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
embedder, embedder,
ranking_score_threshold,
retrieve_vectors,
} = other; } = other;
if self.timestamp.is_none() { if self.timestamp.is_none() {
@ -1785,7 +1737,6 @@ impl SimilarAggregator {
} }
self.embedder |= embedder; self.embedder |= embedder;
self.retrieve_vectors |= retrieve_vectors;
// pagination // pagination
self.max_limit = self.max_limit.max(max_limit); self.max_limit = self.max_limit.max(max_limit);
@ -1798,7 +1749,6 @@ impl SimilarAggregator {
// scoring // scoring
self.show_ranking_score |= show_ranking_score; self.show_ranking_score |= show_ranking_score;
self.show_ranking_score_details |= show_ranking_score_details; self.show_ranking_score_details |= show_ranking_score_details;
self.ranking_score_threshold |= ranking_score_threshold;
} }
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@ -1819,8 +1769,6 @@ impl SimilarAggregator {
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
embedder, embedder,
ranking_score_threshold,
retrieve_vectors,
} = self; } = self;
if total_received == 0 { if total_received == 0 {
@ -1847,9 +1795,6 @@ impl SimilarAggregator {
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
}, },
"vector": {
"retrieve_vectors": retrieve_vectors,
},
"hybrid": { "hybrid": {
"embedder": embedder, "embedder": embedder,
}, },
@ -1863,7 +1808,6 @@ impl SimilarAggregator {
"scoring": { "scoring": {
"show_ranking_score": show_ranking_score, "show_ranking_score": show_ranking_score,
"show_ranking_score_details": show_ranking_score_details, "show_ranking_score_details": show_ranking_score_details,
"ranking_score_threshold": ranking_score_threshold,
}, },
}); });
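Editor's note: every new analytics field in this file follows the same aggregation recipe: booleans such as `distinct`, `retrieve_vectors` and `ranking_score_threshold` are merged with `|=`, counters with `saturating_add`, and sizes with `max`. A trimmed-down sketch of that recipe on a hypothetical aggregator:

// Hypothetical mini-aggregator illustrating the merge rules used above.
#[derive(Default)]
struct MiniAggregator {
    total_received: usize,
    distinct: bool,
    retrieve_vectors: bool,
    ranking_score_threshold: bool,
    max_limit: usize,
}

impl MiniAggregator {
    /// Fold another aggregator into this one, as `SearchAggregator::aggregate` does.
    fn aggregate(&mut self, other: Self) {
        // Counters: saturating so a huge batch cannot overflow.
        self.total_received = self.total_received.saturating_add(other.total_received);
        // Booleans: "was the feature used at least once in the batch?"
        self.distinct |= other.distinct;
        self.retrieve_vectors |= other.retrieve_vectors;
        self.ranking_score_threshold |= other.ranking_score_threshold;
        // Sizes: keep the largest value seen.
        self.max_limit = self.max_limit.max(other.max_limit);
    }
}

fn main() {
    let mut total = MiniAggregator::default();
    total.aggregate(MiniAggregator { total_received: 1, distinct: true, max_limit: 20, ..Default::default() });
    total.aggregate(MiniAggregator { total_received: 1, retrieve_vectors: true, max_limit: 5, ..Default::default() });
    assert!(total.distinct && total.retrieve_vectors && !total.ranking_score_threshold);
    assert_eq!((total.total_received, total.max_limit), (2, 20));
}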

View File

@ -16,7 +16,6 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn; use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::update::IndexDocumentsMethod;
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::DocumentId; use meilisearch_types::milli::DocumentId;
use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::star_or::OptionStarOrList;
use meilisearch_types::tasks::KindWithContent; use meilisearch_types::tasks::KindWithContent;
@ -40,7 +39,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{ use crate::routes::{
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
}; };
use crate::search::{parse_filter, RetrieveVectors}; use crate::search::parse_filter;
use crate::Opt; use crate::Opt;
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| { static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@ -95,8 +94,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub struct GetDocument { pub struct GetDocument {
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>, fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
} }
pub async fn get_document( pub async fn get_document(
@ -110,20 +107,13 @@ pub async fn get_document(
debug!(parameters = ?params, "Get document"); debug!(parameters = ?params, "Get document");
let index_uid = IndexUid::try_from(index_uid)?; let index_uid = IndexUid::try_from(index_uid)?;
let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner(); analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req);
let GetDocument { fields } = params.into_inner();
let attributes_to_retrieve = fields.merge_star_and_none(); let attributes_to_retrieve = fields.merge_star_and_none();
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
analytics.get_fetch_documents(
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
&req,
);
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
let document = let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?;
retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?;
debug!(returns = ?document, "Get document"); debug!(returns = ?document, "Get document");
Ok(HttpResponse::Ok().json(document)) Ok(HttpResponse::Ok().json(document))
} }
@ -163,8 +153,6 @@ pub struct BrowseQueryGet {
limit: Param<usize>, limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>, fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
filter: Option<String>, filter: Option<String>,
} }
@ -178,8 +166,6 @@ pub struct BrowseQuery {
limit: usize, limit: usize,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)] #[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
fields: Option<Vec<String>>, fields: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)] #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
filter: Option<Value>, filter: Option<Value>,
} }
@ -199,7 +185,6 @@ pub async fn documents_by_query_post(
with_filter: body.filter.is_some(), with_filter: body.filter.is_some(),
limit: body.limit, limit: body.limit,
offset: body.offset, offset: body.offset,
retrieve_vectors: body.retrieve_vectors,
}, },
&req, &req,
); );
@ -216,7 +201,7 @@ pub async fn get_documents(
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?params, "Get documents GET"); debug!(parameters = ?params, "Get documents GET");
let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner();
let filter = match filter { let filter = match filter {
Some(f) => match serde_json::from_str(&f) { Some(f) => match serde_json::from_str(&f) {
@ -230,7 +215,6 @@ pub async fn get_documents(
offset: offset.0, offset: offset.0,
limit: limit.0, limit: limit.0,
fields: fields.merge_star_and_none(), fields: fields.merge_star_and_none(),
retrieve_vectors: retrieve_vectors.0,
filter, filter,
}; };
@ -239,7 +223,6 @@ pub async fn get_documents(
with_filter: query.filter.is_some(), with_filter: query.filter.is_some(),
limit: query.limit, limit: query.limit,
offset: query.offset, offset: query.offset,
retrieve_vectors: query.retrieve_vectors,
}, },
&req, &req,
); );
@ -253,14 +236,10 @@ fn documents_by_query(
query: BrowseQuery, query: BrowseQuery,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; let BrowseQuery { offset, limit, fields, filter } = query;
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?;
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
let (total, documents) = let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?;
retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?;
let ret = PaginationView::new(offset, limit, total as usize, documents); let ret = PaginationView::new(offset, limit, total as usize, documents);
@ -600,44 +579,13 @@ fn some_documents<'a, 't: 'a>(
index: &'a Index, index: &'a Index,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
doc_ids: impl IntoIterator<Item = DocumentId> + 'a, doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
retrieve_vectors: RetrieveVectors,
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> { ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(rtxn)?;
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> {
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?)
match retrieve_vectors {
RetrieveVectors::Ignore => {}
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
);
}
document.insert("_vectors".into(), vectors.into());
}
}
Ok(document)
}) })
})) }))
} }
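Editor's note: the `RetrieveVectors` match above either leaves the stored document untouched, strips `_vectors`, or rebuilds it from the embeddings stored in the index. A hedged, self-contained sketch of the same branching on a plain serde_json document (assumes the serde_json crate; the variant semantics are inferred from the surrounding code and the index lookup is stubbed out):

use serde_json::{json, Map, Value};

#[derive(Clone, Copy)]
enum RetrieveVectors {
    Ignore,   // leave the document as stored
    Hide,     // strip the `_vectors` field
    Retrieve, // rebuild `_vectors` from the index's embeddings
}

// Stand-in for `index.embeddings(rtxn, docid)`: (embedder name, vectors) pairs.
fn stub_embeddings() -> Vec<(String, Vec<Vec<f32>>)> {
    vec![("default".to_string(), vec![vec![0.1, 0.2, 0.3]])]
}

fn apply(mut document: Map<String, Value>, mode: RetrieveVectors) -> Map<String, Value> {
    match mode {
        RetrieveVectors::Ignore => {}
        RetrieveVectors::Hide => {
            document.remove("_vectors");
        }
        RetrieveVectors::Retrieve => {
            // Keep any stored `_vectors` object, then add the index's embeddings to it.
            let mut vectors = match document.remove("_vectors") {
                Some(Value::Object(map)) => map,
                _ => Map::new(),
            };
            for (name, embeddings) in stub_embeddings() {
                // `regenerate` is hardcoded here; the real code derives it from
                // whether the vectors were user-provided.
                vectors.insert(name, json!({ "embeddings": embeddings, "regenerate": true }));
            }
            document.insert("_vectors".into(), Value::Object(vectors));
        }
    }
    document
}

fn main() {
    let doc = json!({ "id": 1, "title": "Shazam!" });
    let Value::Object(doc) = doc else { unreachable!() };

    assert!(apply(doc.clone(), RetrieveVectors::Retrieve).contains_key("_vectors"));
    assert!(!apply(doc.clone(), RetrieveVectors::Hide).contains_key("_vectors"));
    assert_eq!(apply(doc.clone(), RetrieveVectors::Ignore), doc);
}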
@ -648,7 +596,6 @@ fn retrieve_documents<S: AsRef<str>>(
limit: usize, limit: usize,
filter: Option<Value>, filter: Option<Value>,
attributes_to_retrieve: Option<Vec<S>>, attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<(u64, Vec<Document>), ResponseError> { ) -> Result<(u64, Vec<Document>), ResponseError> {
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
let filter = &filter; let filter = &filter;
@ -673,57 +620,53 @@ fn retrieve_documents<S: AsRef<str>>(
let (it, number_of_documents) = { let (it, number_of_documents) = {
let number_of_documents = candidates.len(); let number_of_documents = candidates.len();
( (
some_documents( some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?,
index,
&rtxn,
candidates.into_iter().skip(offset).take(limit),
retrieve_vectors,
)?,
number_of_documents, number_of_documents,
) )
}; };
let documents: Vec<_> = it let documents: Result<Vec<_>, ResponseError> = it
.map(|document| { .map(|document| {
Ok(match &attributes_to_retrieve { Ok(match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values( Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document?, &document?,
attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( attributes_to_retrieve.iter().map(|s| s.as_ref()),
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
),
), ),
None => document?, None => document?,
}) })
}) })
.collect::<Result<_, ResponseError>>()?; .collect();
Ok((number_of_documents, documents)) Ok((number_of_documents, documents?))
} }
fn retrieve_document<S: AsRef<str>>( fn retrieve_document<S: AsRef<str>>(
index: &Index, index: &Index,
doc_id: &str, doc_id: &str,
attributes_to_retrieve: Option<Vec<S>>, attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<Document, ResponseError> { ) -> Result<Document, ResponseError> {
let txn = index.read_txn()?; let txn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&txn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = index let internal_id = index
.external_documents_ids() .external_documents_ids()
.get(&txn, doc_id)? .get(&txn, doc_id)?
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)? let document = index
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next() .next()
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??; .map(|(_, d)| d)
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
let document = match &attributes_to_retrieve { let document = match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values( Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document, &document,
attributes_to_retrieve attributes_to_retrieve.iter().map(|s| s.as_ref()),
.iter()
.map(|s| s.as_ref())
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
), ),
None => document, None => document,
}; };
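Editor's note: both document-retrieval paths above extend the caller's field selection with `_vectors` only when vectors were actually requested, via `.chain(cond.then_some("_vectors"))`. A small sketch of that conditional-chain idiom:

/// Return the effective list of fields to keep, optionally appending `_vectors`.
fn effective_fields<'a>(requested: &'a [&'a str], retrieve_vectors: bool) -> Vec<&'a str> {
    requested
        .iter()
        .copied()
        // `bool::then_some` yields `Some("_vectors")` or `None`; chaining an
        // `Option` appends zero or one extra item to the iterator.
        .chain(retrieve_vectors.then_some("_vectors"))
        .collect()
}

fn main() {
    assert_eq!(effective_fields(&["id", "title"], false), ["id", "title"]);
    assert_eq!(effective_fields(&["id", "title"], true), ["id", "title", "_vectors"]);
}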

View File

@ -14,8 +14,8 @@ use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::routes::indexes::search::search_kind; use crate::routes::indexes::search::search_kind;
use crate::search::{ use crate::search::{
add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery,
SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -46,8 +46,6 @@ pub struct FacetSearchQuery {
pub matching_strategy: MatchingStrategy, pub matching_strategy: MatchingStrategy,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)]
pub attributes_to_search_on: Option<Vec<String>>, pub attributes_to_search_on: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)]
pub ranking_score_threshold: Option<RankingScoreThreshold>,
} }
pub async fn search( pub async fn search(
@ -105,7 +103,6 @@ impl From<FacetSearchQuery> for SearchQuery {
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
} = value; } = value;
SearchQuery { SearchQuery {
@ -115,7 +112,6 @@ impl From<FacetSearchQuery> for SearchQuery {
page: None, page: None,
hits_per_page: None, hits_per_page: None,
attributes_to_retrieve: None, attributes_to_retrieve: None,
retrieve_vectors: false,
attributes_to_crop: None, attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(), crop_length: DEFAULT_CROP_LENGTH(),
attributes_to_highlight: None, attributes_to_highlight: None,
@ -124,7 +120,6 @@ impl From<FacetSearchQuery> for SearchQuery {
show_ranking_score_details: false, show_ranking_score_details: false,
filter, filter,
sort: None, sort: None,
distinct: None,
facets: None, facets: None,
highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(), highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(),
highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(), highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(),
@ -133,7 +128,6 @@ impl From<FacetSearchQuery> for SearchQuery {
vector, vector,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
} }
} }
} }
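Editor's note: the `From<FacetSearchQuery> for SearchQuery` impl above copies the fields the two queries share and pins everything facet search never exposes (pagination, highlighting, and now `distinct` and `retrieve_vectors`) to fixed values, listing each field explicitly rather than reaching for `..Default::default()`, the same policy spelled out in a comment near the end of this diff. A heavily reduced sketch of that shape with hypothetical types:

// Hypothetical, heavily reduced versions of the two query types.
struct FacetQuery {
    q: Option<String>,
    filter: Option<String>,
}

#[derive(Debug)]
struct FullQuery {
    q: Option<String>,
    filter: Option<String>,
    // Fields facet search never sets: pinned explicitly in the conversion.
    retrieve_vectors: bool,
    distinct: Option<String>,
    limit: usize,
}

impl From<FacetQuery> for FullQuery {
    fn from(value: FacetQuery) -> Self {
        let FacetQuery { q, filter } = value;
        FullQuery {
            q,
            filter,
            retrieve_vectors: false,
            distinct: None,
            limit: 20,
            // No `..Default::default()` on purpose: adding a field to FullQuery
            // should force this conversion to be revisited.
        }
    }
}

fn main() {
    let full: FullQuery = FacetQuery { q: Some("shoe".into()), filter: None }.into();
    assert_eq!(full.limit, 20);
    println!("{full:?}");
}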

View File

@ -19,10 +19,9 @@ use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
use crate::search::{ use crate::search::{
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchKind, SearchQuery,
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -51,8 +50,6 @@ pub struct SearchQueryGet {
hits_per_page: Option<Param<usize>>, hits_per_page: Option<Param<usize>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>, attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)]
attributes_to_crop: Option<CS<String>>, attributes_to_crop: Option<CS<String>>,
#[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)] #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)]
@ -63,8 +60,6 @@ pub struct SearchQueryGet {
filter: Option<String>, filter: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchSort>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchSort>)]
sort: Option<String>, sort: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchDistinct>)]
distinct: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchShowMatchesPosition>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchShowMatchesPosition>)]
show_matches_position: Param<bool>, show_matches_position: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchShowRankingScore>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchShowRankingScore>)]
@ -87,21 +82,6 @@ pub struct SearchQueryGet {
pub hybrid_embedder: Option<String>, pub hybrid_embedder: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchSemanticRatio>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchSemanticRatio>)]
pub hybrid_semantic_ratio: Option<SemanticRatioGet>, pub hybrid_semantic_ratio: Option<SemanticRatioGet>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchRankingScoreThreshold>)]
pub ranking_score_threshold: Option<RankingScoreThresholdGet>,
}
#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)]
#[deserr(try_from(String) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)]
pub struct RankingScoreThresholdGet(RankingScoreThreshold);
impl std::convert::TryFrom<String> for RankingScoreThresholdGet {
type Error = InvalidSearchRankingScoreThreshold;
fn try_from(s: String) -> Result<Self, Self::Error> {
let f: f64 = s.parse().map_err(|_| InvalidSearchRankingScoreThreshold)?;
Ok(RankingScoreThresholdGet(RankingScoreThreshold::try_from(f)?))
}
} }
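Editor's note: because a GET query string cannot carry a JSON float, `rankingScoreThreshold` arrives as text and goes through two conversions: the string is parsed to an f64, which is then validated into the JSON-side type. A hedged sketch of that two-step parse with simplified local error types:

// Simplified stand-ins for the deserr error code and the validated JSON-side type.
#[derive(Debug, PartialEq)]
struct InvalidRankingScoreThreshold;

#[derive(Debug, PartialEq)]
struct RankingScoreThreshold(f64);

impl TryFrom<f64> for RankingScoreThreshold {
    type Error = InvalidRankingScoreThreshold;
    fn try_from(f: f64) -> Result<Self, Self::Error> {
        if (0.0..=1.0).contains(&f) {
            Ok(Self(f))
        } else {
            Err(InvalidRankingScoreThreshold)
        }
    }
}

// Query-string wrapper: parse the text first, then reuse the f64 validation.
#[derive(Debug, PartialEq)]
struct RankingScoreThresholdGet(RankingScoreThreshold);

impl TryFrom<String> for RankingScoreThresholdGet {
    type Error = InvalidRankingScoreThreshold;
    fn try_from(s: String) -> Result<Self, Self::Error> {
        let f: f64 = s.parse().map_err(|_| InvalidRankingScoreThreshold)?;
        Ok(Self(RankingScoreThreshold::try_from(f)?))
    }
}

fn main() {
    assert!(RankingScoreThresholdGet::try_from("0.8".to_string()).is_ok());
    assert!(RankingScoreThresholdGet::try_from("1.5".to_string()).is_err());
    assert!(RankingScoreThresholdGet::try_from("high".to_string()).is_err());
}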
#[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)] #[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)]
@ -157,13 +137,11 @@ impl From<SearchQueryGet> for SearchQuery {
page: other.page.as_deref().copied(), page: other.page.as_deref().copied(),
hits_per_page: other.hits_per_page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(),
attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: other.retrieve_vectors.0,
attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
crop_length: other.crop_length.0, crop_length: other.crop_length.0,
attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
filter, filter,
sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)), sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)),
distinct: other.distinct,
show_matches_position: other.show_matches_position.0, show_matches_position: other.show_matches_position.0,
show_ranking_score: other.show_ranking_score.0, show_ranking_score: other.show_ranking_score.0,
show_ranking_score_details: other.show_ranking_score_details.0, show_ranking_score_details: other.show_ranking_score_details.0,
@ -174,7 +152,6 @@ impl From<SearchQueryGet> for SearchQuery {
matching_strategy: other.matching_strategy, matching_strategy: other.matching_strategy,
attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()),
hybrid, hybrid,
ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0),
} }
} }
} }
@ -228,12 +205,10 @@ pub async fn search_with_url_query(
let features = index_scheduler.features(); let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?; let _permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || { let search_result =
perform_search(&index, query, search_kind, retrieve_vector) tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
})
.await?;
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
} }
@ -270,13 +245,10 @@ pub async fn search_with_post(
let features = index_scheduler.features(); let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?; let _permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || { let search_result =
perform_search(&index, query, search_kind, retrieve_vectors) tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
})
.await?;
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
if search_result.degraded { if search_result.degraded {
@ -298,10 +270,11 @@ pub fn search_kind(
features: RoFeatures, features: RoFeatures,
) -> Result<SearchKind, ResponseError> { ) -> Result<SearchKind, ResponseError> {
if query.vector.is_some() { if query.vector.is_some() {
features.check_vector("Passing `vector` as a parameter")?; features.check_vector("Passing `vector` as a query parameter")?;
} }
if query.hybrid.is_some() { if query.hybrid.is_some() {
features.check_vector("Passing `hybrid` as a parameter")?; features.check_vector("Passing `hybrid` as a query parameter")?;
} }
// regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing

View File

@ -4,7 +4,11 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter};
use index_scheduler::IndexScheduler; use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::deserr_codes::{
InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId,
InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarShowRankingScore,
InvalidSimilarShowRankingScoreDetails,
};
use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::error::{ErrorCode as _, ResponseError};
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::keys::actions; use meilisearch_types::keys::actions;
@ -17,8 +21,8 @@ use crate::analytics::{Analytics, SimilarAggregator};
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::search::{ use crate::search::{
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, add_search_rules, perform_similar, SearchKind, SimilarQuery, SimilarResult,
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
}; };
pub fn configure(cfg: &mut web::ServiceConfig) { pub fn configure(cfg: &mut web::ServiceConfig) {
@ -38,7 +42,9 @@ pub async fn similar_get(
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let query = params.0.try_into()?; let query = params.0.try_into().map_err(|code: InvalidSimilarId| {
ResponseError::from_msg(code.to_string(), code.error_code())
})?;
let mut aggregate = SimilarAggregator::from_query(&query, &req); let mut aggregate = SimilarAggregator::from_query(&query, &req);
@ -93,8 +99,6 @@ async fn similar(
features.check_vector("Using the similar API")?; features.check_vector("Using the similar API")?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
// Tenant token search_rules. // Tenant token search_rules.
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
add_search_rules(&mut query.filter, search_rules); add_search_rules(&mut query.filter, search_rules);
@ -105,10 +109,8 @@ async fn similar(
let (embedder_name, embedder) = let (embedder_name, embedder) =
SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?;
tokio::task::spawn_blocking(move || { tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder))
perform_similar(&index, query, embedder_name, embedder, retrieve_vectors) .await?
})
.await?
} }
#[derive(Debug, deserr::Deserr)] #[derive(Debug, deserr::Deserr)]
@ -122,35 +124,18 @@ pub struct SimilarQueryGet {
limit: Param<usize>, limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>, attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)]
filter: Option<String>, filter: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)]
show_ranking_score: Param<bool>, show_ranking_score: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScoreDetails>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScoreDetails>)]
show_ranking_score_details: Param<bool>, show_ranking_score_details: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRankingScoreThreshold>, default)]
pub ranking_score_threshold: Option<RankingScoreThresholdGet>,
#[deserr(default, error = DeserrQueryParamError<InvalidEmbedder>)] #[deserr(default, error = DeserrQueryParamError<InvalidEmbedder>)]
pub embedder: Option<String>, pub embedder: Option<String>,
} }
#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)]
#[deserr(try_from(String) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)]
pub struct RankingScoreThresholdGet(RankingScoreThresholdSimilar);
impl std::convert::TryFrom<String> for RankingScoreThresholdGet {
type Error = InvalidSimilarRankingScoreThreshold;
fn try_from(s: String) -> Result<Self, Self::Error> {
let f: f64 = s.parse().map_err(|_| InvalidSimilarRankingScoreThreshold)?;
Ok(RankingScoreThresholdGet(RankingScoreThresholdSimilar::try_from(f)?))
}
}
impl TryFrom<SimilarQueryGet> for SimilarQuery { impl TryFrom<SimilarQueryGet> for SimilarQuery {
type Error = ResponseError; type Error = InvalidSimilarId;
fn try_from( fn try_from(
SimilarQueryGet { SimilarQueryGet {
@ -158,12 +143,10 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
offset, offset,
limit, limit,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
filter, filter,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
embedder, embedder,
ranking_score_threshold,
}: SimilarQueryGet, }: SimilarQueryGet,
) -> Result<Self, Self::Error> { ) -> Result<Self, Self::Error> {
let filter = match filter { let filter = match filter {
@ -175,18 +158,14 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
}; };
Ok(SimilarQuery { Ok(SimilarQuery {
id: id.0.try_into().map_err(|code: InvalidSimilarId| { id: id.0.try_into()?,
ResponseError::from_msg(code.to_string(), code.error_code())
})?,
offset: offset.0, offset: offset.0,
limit: limit.0, limit: limit.0,
filter, filter,
embedder, embedder,
attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: retrieve_vectors.0,
show_ranking_score: show_ranking_score.0, show_ranking_score: show_ranking_score.0,
show_ranking_score_details: show_ranking_score_details.0, show_ranking_score_details: show_ranking_score_details.0,
ranking_score_threshold: ranking_score_threshold.map(|x| x.0),
}) })
} }
} }
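Editor's note: the conversion above now returns a `ResponseError` directly, turning the typed `InvalidSimilarId` code into a message plus error-code pair at the boundary instead of leaking the typed error to the caller. A hedged sketch of that boundary mapping with simplified local types:

use std::fmt;

// Simplified stand-ins for the real error types.
#[derive(Debug)]
struct InvalidSimilarId;

impl fmt::Display for InvalidSimilarId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "the value of `id` is invalid")
    }
}

impl InvalidSimilarId {
    fn error_code(&self) -> &'static str {
        "invalid_similar_id"
    }
}

#[derive(Debug)]
struct ResponseError {
    message: String,
    code: &'static str,
}

impl ResponseError {
    fn from_msg(message: String, code: &'static str) -> Self {
        Self { message, code }
    }
}

// Placeholder validation rule for the sketch: an id must not be empty.
fn parse_id(raw: &str) -> Result<String, InvalidSimilarId> {
    if raw.is_empty() { Err(InvalidSimilarId) } else { Ok(raw.to_string()) }
}

// The conversion boundary: map the typed code into a ResponseError right here,
// like the new `TryFrom<SimilarQueryGet> for SimilarQuery` impl does.
fn convert(raw_id: &str) -> Result<String, ResponseError> {
    parse_id(raw_id).map_err(|code| ResponseError::from_msg(code.to_string(), code.error_code()))
}

fn main() {
    assert!(convert("movie-42").is_ok());
    let err = convert("").unwrap_err();
    assert_eq!(err.code, "invalid_similar_id");
    println!("{}", err.message);
}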

View File

@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::search::search_kind; use crate::routes::indexes::search::search_kind;
use crate::search::{ use crate::search::{
add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -83,14 +83,11 @@ pub async fn multi_search_with_post(
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)
.with_index(query_index)?; .with_index(query_index)?;
let retrieve_vector =
RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?;
let search_result = tokio::task::spawn_blocking(move || { let search_result =
perform_search(&index, query, search_kind, retrieve_vector) tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind))
}) .await
.await .with_index(query_index)?;
.with_index(query_index)?;
search_results.push(SearchResultWithIndex { search_results.push(SearchResultWithIndex {
index_uid: index_uid.into_inner(), index_uid: index_uid.into_inner(),

View File

@ -15,7 +15,6 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn; use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::vector::Embedder;
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
@ -60,8 +59,6 @@ pub struct SearchQuery {
pub hits_per_page: Option<usize>, pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>, pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())] #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@ -78,8 +75,6 @@ pub struct SearchQuery {
pub filter: Option<Value>, pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchSort>)] #[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
pub sort: Option<Vec<String>>, pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
pub distinct: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)] #[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>, pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())] #[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@ -92,44 +87,6 @@ pub struct SearchQuery {
pub matching_strategy: MatchingStrategy, pub matching_strategy: MatchingStrategy,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)]
pub attributes_to_search_on: Option<Vec<String>>, pub attributes_to_search_on: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)]
pub ranking_score_threshold: Option<RankingScoreThreshold>,
}
#[derive(Debug, Clone, Copy, PartialEq, Deserr)]
#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)]
pub struct RankingScoreThreshold(f64);
impl std::convert::TryFrom<f64> for RankingScoreThreshold {
type Error = InvalidSearchRankingScoreThreshold;
fn try_from(f: f64) -> Result<Self, Self::Error> {
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
#[allow(clippy::manual_range_contains)]
if f > 1.0 || f < 0.0 {
Err(InvalidSearchRankingScoreThreshold)
} else {
Ok(RankingScoreThreshold(f))
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Deserr)]
#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)]
pub struct RankingScoreThresholdSimilar(f64);
impl std::convert::TryFrom<f64> for RankingScoreThresholdSimilar {
type Error = InvalidSimilarRankingScoreThreshold;
fn try_from(f: f64) -> Result<Self, Self::Error> {
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
#[allow(clippy::manual_range_contains)]
if f > 1.0 || f < 0.0 {
Err(InvalidSimilarRankingScoreThreshold)
} else {
Ok(Self(f))
}
}
} }
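Editor's note: the comments above record that clippy's suggested `!(0.0..=1.0).contains(&f)` was considered less readable than the explicit comparisons. A tiny sketch, kept separate from the real types, checking that the two forms agree on finite inputs:

fn out_of_range_manual(f: f64) -> bool {
    // The form used in the diff above (with the clippy lint allowed).
    f > 1.0 || f < 0.0
}

fn out_of_range_contains(f: f64) -> bool {
    // The form clippy suggests instead.
    !(0.0..=1.0).contains(&f)
}

fn main() {
    // For finite inputs the two checks agree; they differ only on NaN,
    // which a JSON payload cannot produce in the first place.
    for f in [-0.1, 0.0, 0.25, 0.5, 1.0, 1.5, 42.0] {
        assert_eq!(out_of_range_manual(f), out_of_range_contains(f));
    }
    println!("both forms agree on all tested values");
}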
// Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum. // Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum.
@ -146,7 +103,6 @@ impl fmt::Debug for SearchQuery {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -155,14 +111,12 @@ impl fmt::Debug for SearchQuery {
show_ranking_score_details, show_ranking_score_details,
filter, filter,
sort, sort,
distinct,
facets, facets,
highlight_pre_tag, highlight_pre_tag,
highlight_post_tag, highlight_post_tag,
crop_marker, crop_marker,
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
ranking_score_threshold,
} = self; } = self;
let mut debug = f.debug_struct("SearchQuery"); let mut debug = f.debug_struct("SearchQuery");
@ -180,9 +134,6 @@ impl fmt::Debug for SearchQuery {
if let Some(q) = q { if let Some(q) = q {
debug.field("q", &q); debug.field("q", &q);
} }
if *retrieve_vectors {
debug.field("retrieve_vectors", &retrieve_vectors);
}
if let Some(v) = vector { if let Some(v) = vector {
if v.len() < 10 { if v.len() < 10 {
debug.field("vector", &v); debug.field("vector", &v);
@ -205,9 +156,6 @@ impl fmt::Debug for SearchQuery {
if let Some(sort) = sort { if let Some(sort) = sort {
debug.field("sort", &sort); debug.field("sort", &sort);
} }
if let Some(distinct) = distinct {
debug.field("distinct", &distinct);
}
if let Some(facets) = facets { if let Some(facets) = facets {
debug.field("facets", &facets); debug.field("facets", &facets);
} }
@ -240,9 +188,6 @@ impl fmt::Debug for SearchQuery {
debug.field("highlight_pre_tag", &highlight_pre_tag); debug.field("highlight_pre_tag", &highlight_pre_tag);
debug.field("highlight_post_tag", &highlight_post_tag); debug.field("highlight_post_tag", &highlight_post_tag);
debug.field("crop_marker", &crop_marker); debug.field("crop_marker", &crop_marker);
if let Some(ranking_score_threshold) = ranking_score_threshold {
debug.field("ranking_score_threshold", &ranking_score_threshold);
}
debug.finish() debug.finish()
} }
@ -383,8 +328,6 @@ pub struct SearchQueryWithIndex {
pub hits_per_page: Option<usize>, pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>, pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())] #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@ -401,8 +344,6 @@ pub struct SearchQueryWithIndex {
pub filter: Option<Value>, pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchSort>)] #[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
pub sort: Option<Vec<String>>, pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
pub distinct: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)] #[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>, pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())] #[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@ -415,8 +356,6 @@ pub struct SearchQueryWithIndex {
pub matching_strategy: MatchingStrategy, pub matching_strategy: MatchingStrategy,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToSearchOn>, default)]
pub attributes_to_search_on: Option<Vec<String>>, pub attributes_to_search_on: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)]
pub ranking_score_threshold: Option<RankingScoreThreshold>,
} }
impl SearchQueryWithIndex { impl SearchQueryWithIndex {
@ -430,7 +369,6 @@ impl SearchQueryWithIndex {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -439,7 +377,6 @@ impl SearchQueryWithIndex {
show_matches_position, show_matches_position,
filter, filter,
sort, sort,
distinct,
facets, facets,
highlight_pre_tag, highlight_pre_tag,
highlight_post_tag, highlight_post_tag,
@ -447,7 +384,6 @@ impl SearchQueryWithIndex {
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
} = self; } = self;
( (
index_uid, index_uid,
@ -459,7 +395,6 @@ impl SearchQueryWithIndex {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -468,7 +403,6 @@ impl SearchQueryWithIndex {
show_matches_position, show_matches_position,
filter, filter,
sort, sort,
distinct,
facets, facets,
highlight_pre_tag, highlight_pre_tag,
highlight_post_tag, highlight_post_tag,
@ -476,7 +410,6 @@ impl SearchQueryWithIndex {
matching_strategy, matching_strategy,
attributes_to_search_on, attributes_to_search_on,
hybrid, hybrid,
ranking_score_threshold,
// do not use ..Default::default() here, // do not use ..Default::default() here,
// rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex`
}, },
@ -499,14 +432,10 @@ pub struct SimilarQuery {
pub embedder: Option<String>, pub embedder: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)] #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)]
pub show_ranking_score: bool, pub show_ranking_score: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)] #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)]
pub show_ranking_score_details: bool, pub show_ranking_score_details: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarRankingScoreThreshold>, default)]
pub ranking_score_threshold: Option<RankingScoreThresholdSimilar>,
} }
#[derive(Debug, Clone, PartialEq, Deserr)] #[derive(Debug, Clone, PartialEq, Deserr)]
@ -548,8 +477,6 @@ pub enum MatchingStrategy {
Last, Last,
/// All query words are mandatory /// All query words are mandatory
All, All,
/// Remove query words from the most frequent to the least
Frequency,
} }
impl Default for MatchingStrategy { impl Default for MatchingStrategy {
@ -563,7 +490,6 @@ impl From<MatchingStrategy> for TermsMatchingStrategy {
match other { match other {
MatchingStrategy::Last => Self::Last, MatchingStrategy::Last => Self::Last,
MatchingStrategy::All => Self::All, MatchingStrategy::All => Self::All,
MatchingStrategy::Frequency => Self::Frequency,
} }
} }
} }
@ -735,13 +661,6 @@ fn prepare_search<'t>(
) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> {
let mut search = index.search(rtxn); let mut search = index.search(rtxn);
search.time_budget(time_budget); search.time_budget(time_budget);
if let Some(ranking_score_threshold) = query.ranking_score_threshold {
search.ranking_score_threshold(ranking_score_threshold.0);
}
if let Some(distinct) = &query.distinct {
search.distinct(distinct.clone());
}
match search_kind { match search_kind {
SearchKind::KeywordOnly => { SearchKind::KeywordOnly => {
@ -783,16 +702,11 @@ fn prepare_search<'t>(
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
search.exhaustive_number_hits(is_finite_pagination); search.exhaustive_number_hits(is_finite_pagination);
search.scoring_strategy( search.scoring_strategy(if query.show_ranking_score || query.show_ranking_score_details {
if query.show_ranking_score ScoringStrategy::Detailed
|| query.show_ranking_score_details } else {
|| query.ranking_score_threshold.is_some() ScoringStrategy::Skip
{ });
ScoringStrategy::Detailed
} else {
ScoringStrategy::Skip
},
);
// compute the offset on the limit depending on the pagination mode. // compute the offset on the limit depending on the pagination mode.
let (offset, limit) = if is_finite_pagination { let (offset, limit) = if is_finite_pagination {
@ -837,7 +751,6 @@ pub fn perform_search(
index: &Index, index: &Index,
query: SearchQuery, query: SearchQuery,
search_kind: SearchKind, search_kind: SearchKind,
retrieve_vectors: RetrieveVectors,
) -> Result<SearchResult, MeilisearchHttpError> { ) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now(); let before_search = Instant::now();
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
@ -871,37 +784,32 @@ pub fn perform_search(
let SearchQuery { let SearchQuery {
q, q,
vector: _,
hybrid: _,
// already computed from prepare_search
offset: _,
limit, limit,
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
// use the enum passed as parameter
retrieve_vectors: _,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
show_matches_position, show_matches_position,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
filter: _,
sort, sort,
facets, facets,
highlight_pre_tag, highlight_pre_tag,
highlight_post_tag, highlight_post_tag,
crop_marker, crop_marker,
// already used in prepare_search
vector: _,
hybrid: _,
offset: _,
ranking_score_threshold: _,
matching_strategy: _, matching_strategy: _,
attributes_to_search_on: _, attributes_to_search_on: _,
filter: _,
distinct: _,
} = query; } = query;
let format = AttributesFormat { let format = AttributesFormat {
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight, attributes_to_highlight,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
@ -985,7 +893,6 @@ pub fn perform_search(
struct AttributesFormat { struct AttributesFormat {
attributes_to_retrieve: Option<BTreeSet<String>>, attributes_to_retrieve: Option<BTreeSet<String>>,
retrieve_vectors: RetrieveVectors,
attributes_to_highlight: Option<HashSet<String>>, attributes_to_highlight: Option<HashSet<String>>,
attributes_to_crop: Option<Vec<String>>, attributes_to_crop: Option<Vec<String>>,
crop_length: usize, crop_length: usize,
@ -998,36 +905,6 @@ struct AttributesFormat {
show_ranking_score_details: bool, show_ranking_score_details: bool,
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetrieveVectors {
/// Do not touch the `_vectors` field
///
/// this is the behavior when the vectorStore feature is disabled
Ignore,
/// Remove the `_vectors` field
///
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false`
Hide,
/// Retrieve vectors from the DB and merge them into the `_vectors` field
///
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true`
Retrieve,
}
impl RetrieveVectors {
pub fn new(
retrieve_vector: bool,
features: index_scheduler::RoFeatures,
) -> Result<Self, index_scheduler::Error> {
match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) {
(true, Ok(())) => Ok(Self::Retrieve),
(true, Err(error)) => Err(error),
(false, Ok(())) => Ok(Self::Hide),
(false, Err(_)) => Ok(Self::Ignore),
}
}
}
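The mapping implemented by `RetrieveVectors::new` above is a small truth table over the request flag and the `vectorStore` experimental feature. A simplified, self-contained sketch (the real function takes `index_scheduler::RoFeatures` and propagates its error type; the plain boolean below is an illustrative stand-in for the feature check):

#[derive(Debug, PartialEq)]
enum RetrieveVectors {
    Ignore,
    Hide,
    Retrieve,
}

// `vector_store_enabled` stands in for the `features.check_vector(..)` call.
fn resolve(retrieve_vectors: bool, vector_store_enabled: bool) -> Result<RetrieveVectors, &'static str> {
    match (retrieve_vectors, vector_store_enabled) {
        (true, true) => Ok(RetrieveVectors::Retrieve),
        (true, false) => Err("feature_not_enabled"),
        (false, true) => Ok(RetrieveVectors::Hide),
        (false, false) => Ok(RetrieveVectors::Ignore),
    }
}

fn main() {
    assert_eq!(resolve(false, false), Ok(RetrieveVectors::Ignore));
    assert_eq!(resolve(false, true), Ok(RetrieveVectors::Hide));
    assert_eq!(resolve(true, true), Ok(RetrieveVectors::Retrieve));
    assert!(resolve(true, false).is_err());
}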
fn make_hits( fn make_hits(
index: &Index, index: &Index,
rtxn: &RoTxn<'_>, rtxn: &RoTxn<'_>,
@ -1037,32 +914,10 @@ fn make_hits(
document_scores: Vec<Vec<ScoreDetails>>, document_scores: Vec<Vec<ScoreDetails>>,
) -> Result<Vec<SearchHit>, MeilisearchHttpError> { ) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
let displayed_ids = let displayed_ids = index
index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>()); .displayed_fields_ids(rtxn)?
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
(None, _) => false,
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
(Some(_), None) => true,
// displayed_ids is a finite list, so hide if `_vectors` is not part of it
(Some(map), Some(vectors_fid)) => !map.contains(&vectors_fid),
};
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
if vectors_is_hidden {
RetrieveVectors::Hide
} else {
RetrieveVectors::Retrieve
}
} else {
format.retrieve_vectors
};
let displayed_ids =
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let fids = |attrs: &BTreeSet<String>| { let fids = |attrs: &BTreeSet<String>| {
let mut ids = BTreeSet::new(); let mut ids = BTreeSet::new();
for attr in attrs { for attr in attrs {
@ -1085,7 +940,6 @@ fn make_hits(
.intersection(&displayed_ids) .intersection(&displayed_ids)
.cloned() .cloned()
.collect(); .collect();
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
let formatted_options = compute_formatted_options( let formatted_options = compute_formatted_options(
@ -1119,48 +973,18 @@ fn make_hits(
formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag); formatter_builder.highlight_suffix(format.highlight_post_tag);
let mut documents = Vec::new(); let mut documents = Vec::new();
let embedding_configs = index.embedding_configs(rtxn)?;
let documents_iter = index.documents(rtxn, documents_ids)?; let documents_iter = index.documents(rtxn, documents_ids)?;
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
// First generate a document with all the displayed fields // First generate a document with all the displayed fields
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
let add_vectors_fid =
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
// select the attributes to retrieve // select the attributes to retrieve
let attributes_to_retrieve = to_retrieve_ids let attributes_to_retrieve = to_retrieve_ids
.iter() .iter()
// skip the vectors_fid if RetrieveVectors::Hide
.filter(|fid| match vectors_fid {
Some(vectors_fid) => {
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
}
None => true,
})
// need to retrieve the existing `_vectors` field if `RetrieveVectors::Retrieve` is set
.chain(add_vectors_fid.iter())
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); .map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
let mut document = let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
if retrieve_vectors == RetrieveVectors::Retrieve {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, id)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(name, serde_json::to_value(embeddings)?);
}
document.insert("_vectors".into(), vectors.into());
}
let (matches_position, formatted) = format_fields( let (matches_position, formatted) = format_fields(
&displayed_document, &displayed_document,
&fields_ids_map, &fields_ids_map,
@ -1230,7 +1054,6 @@ pub fn perform_similar(
query: SimilarQuery, query: SimilarQuery,
embedder_name: String, embedder_name: String,
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
retrieve_vectors: RetrieveVectors,
) -> Result<SimilarResult, ResponseError> { ) -> Result<SimilarResult, ResponseError> {
let before_search = Instant::now(); let before_search = Instant::now();
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
@ -1242,10 +1065,8 @@ pub fn perform_similar(
filter: _, filter: _,
embedder: _, embedder: _,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors: _,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
ranking_score_threshold,
} = query; } = query;
// using let-else rather than `?` so that the borrow checker identifies we're always returning here, // using let-else rather than `?` so that the borrow checker identifies we're always returning here,
@ -1269,10 +1090,6 @@ pub fn perform_similar(
} }
} }
if let Some(ranking_score_threshold) = ranking_score_threshold {
similar.ranking_score_threshold(ranking_score_threshold.0);
}
let milli::SearchResult { let milli::SearchResult {
documents_ids, documents_ids,
matching_words: _, matching_words: _,
@ -1289,7 +1106,6 @@ pub fn perform_similar(
let format = AttributesFormat { let format = AttributesFormat {
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight: None, attributes_to_highlight: None,
attributes_to_crop: None, attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(), crop_length: DEFAULT_CROP_LENGTH(),

View File

@ -40,9 +40,8 @@ pub struct Permit {
impl Drop for Permit { impl Drop for Permit {
fn drop(&mut self) { fn drop(&mut self) {
let sender = self.sender.clone();
// if the channel is closed then the whole instance is down // if the channel is closed then the whole instance is down
std::mem::drop(tokio::spawn(async move { sender.send(()).await })); let _ = futures::executor::block_on(self.sender.send(()));
} }
} }
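Both sides of this hunk implement the same idea: hand the permit back through a channel when it is dropped, and ignore the error if the receiver is gone. A self-contained sketch of the pattern with a std channel (the real code must bridge a synchronous `Drop` to an async sender, hence `tokio::spawn` on one side and `futures::executor::block_on` on the other):

use std::sync::mpsc;

struct Permit {
    sender: mpsc::Sender<()>,
}

impl Drop for Permit {
    fn drop(&mut self) {
        // If the channel is closed then the whole instance is down; ignore the error.
        let _ = self.sender.send(());
    }
}

fn main() {
    let (sender, receiver) = mpsc::channel();
    {
        let _permit = Permit { sender };
    } // `_permit` is dropped here and the slot is released
    assert!(receiver.recv().is_ok());
}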

View File

@ -182,10 +182,14 @@ impl Index<'_> {
self.service.get(url).await self.service.get(url).await
} }
pub async fn get_document(&self, id: u64, options: Option<Value>) -> (Value, StatusCode) { pub async fn get_document(
&self,
id: u64,
options: Option<GetDocumentOptions>,
) -> (Value, StatusCode) {
let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id);
if let Some(options) = options { if let Some(fields) = options.and_then(|o| o.fields) {
write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap(); let _ = write!(url, "?fields={}", fields.join(","));
} }
self.service.get(url).await self.service.get(url).await
} }
@ -201,11 +205,18 @@ impl Index<'_> {
} }
pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) {
let url = format!( let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref()));
"/indexes/{}/documents?{}", if let Some(limit) = options.limit {
urlencode(self.uid.as_ref()), let _ = write!(url, "limit={}&", limit);
yaup::to_string(&options).unwrap() }
);
if let Some(offset) = options.offset {
let _ = write!(url, "offset={}&", offset);
}
if let Some(attributes_to_retrieve) = options.attributes_to_retrieve {
let _ = write!(url, "fields={}&", attributes_to_retrieve.join(","));
}
self.service.get(url).await self.service.get(url).await
} }
@ -424,11 +435,13 @@ impl Index<'_> {
} }
} }
#[derive(Debug, Default, serde::Serialize)] pub struct GetDocumentOptions {
#[serde(rename_all = "camelCase")] pub fields: Option<Vec<&'static str>>,
}
#[derive(Debug, Default)]
pub struct GetAllDocumentsOptions { pub struct GetAllDocumentsOptions {
pub limit: Option<usize>, pub limit: Option<usize>,
pub offset: Option<usize>, pub offset: Option<usize>,
pub retrieve_vectors: bool, pub attributes_to_retrieve: Option<Vec<&'static str>>,
pub fields: Option<Vec<&'static str>>,
} }

View File

@ -6,7 +6,7 @@ pub mod service;
use std::fmt::{self, Display}; use std::fmt::{self, Display};
#[allow(unused)] #[allow(unused)]
pub use index::GetAllDocumentsOptions; pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
use meili_snap::json_string; use meili_snap::json_string;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
#[allow(unused)] #[allow(unused)]
@ -65,14 +65,7 @@ impl Display for Value {
write!( write!(
f, f,
"{}", "{}",
json_string!(self, { json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
".enqueuedAt" => "[date]",
".startedAt" => "[date]",
".finishedAt" => "[date]",
".duration" => "[duration]",
".processingTimeMs" => "[duration]",
".details.embedders.*.url" => "[url]"
})
) )
} }
} }

View File

@ -795,70 +795,3 @@ async fn fetch_document_by_filter() {
} }
"###); "###);
} }
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
// GET ALL DOCUMENTS BY QUERY
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// FETCH ALL DOCUMENTS BY POST
let (response, _code) =
index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// GET A SINGLE DOCUMENT
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
}

View File

@ -4,7 +4,7 @@ use meili_snap::*;
use urlencoding::encode as urlencode; use urlencoding::encode as urlencode;
use crate::common::encoder::Encoder; use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, Server, Value}; use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value};
use crate::json; use crate::json;
// TODO: partial test since we are testing error, and error is not yet fully implemented in // TODO: partial test since we are testing error, and error is not yet fully implemented in
@ -59,7 +59,8 @@ async fn get_document() {
}) })
); );
let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await; let (response, code) =
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -68,8 +69,9 @@ async fn get_document() {
}) })
); );
let (response, code) = let (response, code) = index
index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await; .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) }))
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -209,7 +211,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name"]), attributes_to_retrieve: Some(vec!["name"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -223,19 +225,9 @@ async fn test_get_all_documents_attributes_to_retrieve() {
assert_eq!(response["limit"], json!(20)); assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77)); assert_eq!(response["total"], json!(77));
let (response, code) = index.get_all_documents_raw("?fields=").await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["wrong"]), attributes_to_retrieve: Some(vec![]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -250,7 +242,22 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name", "tags"]), attributes_to_retrieve: Some(vec!["wrong"]),
..Default::default()
})
.await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["name", "tags"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -263,7 +270,10 @@ async fn test_get_all_documents_attributes_to_retrieve() {
} }
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() }) .get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["*"]),
..Default::default()
})
.await; .await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20); assert_eq!(response["results"].as_array().unwrap().len(), 20);
@ -273,7 +283,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["*", "wrong"]), attributes_to_retrieve: Some(vec!["*", "wrong"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -306,10 +316,12 @@ async fn get_document_s_nested_attributes_to_retrieve() {
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(1).await; index.wait_task(1).await;
let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; let (response, code) =
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response, json!({})); assert_eq!(response, json!({}));
let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await; let (response, code) =
index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -321,7 +333,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
}) })
); );
let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await; let (response, code) = index
.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -329,7 +343,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
"content.truc": "foobar", "content.truc": "foobar",
}) })
); );
let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await; let (response, code) = index
.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -524,207 +540,3 @@ async fn get_document_by_filter() {
} }
"###); "###);
} }
#[actix_rt::test]
async fn get_document_with_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
// by default you shouldn't see the `_vectors` object
let (documents, _code) = index.get_all_documents(Default::default()).await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, None).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir"
}
"###);
// if we try to retrieve the vectors with the `fields` parameter they
// still shouldn't be displayed
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name", "_vectors"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir"
},
{
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir"
}
"###);
// If we specify the retrieve vectors boolean and nothing else we should get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
// If we specify the retrieve vectors boolean and exclude vectors from the `fields` we should still get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
retrieve_vectors: true,
fields: Some(vec!["name"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
}

View File

@ -1938,210 +1938,3 @@ async fn import_dump_v6_containing_experimental_features() {
}) })
.await; .await;
} }
// In this test we must generate the dump ourselves to ensure the
// `user provided` vectors are set correctly
#[actix_rt::test]
#[cfg_attr(target_os = "windows", ignore)]
async fn generate_and_import_dump_containing_vectors() {
let temp = tempfile::tempdir().unwrap();
let mut opt = default_settings(temp.path());
let server = Server::new_with_options(opt.clone()).await.unwrap();
let (code, _) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index
.update_settings(json!(
{
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}",
}
}
}
))
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
let (response, code) = index
.add_documents(
json!([
{"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }},
{"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}},
{"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}},
{"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}},
{"id": 4, "doggo": "max" },
]),
None,
)
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
let (response, code) = server.create_dump().await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response["status"], @r###""succeeded""###);
// ========= We made a dump, now we should clear the DB and try to import our dump
drop(server);
tokio::fs::remove_dir_all(&opt.db_path).await.unwrap();
let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap());
let dump_path = opt.dump_dir.join(dump_name);
assert!(dump_path.exists(), "path: `{}`", dump_path.display());
opt.import_dump = Some(dump_path);
// NOTE: We shouldn't have to change the database path but I lost one hour
// because of a « bad path » error and that fixed it.
opt.db_path = temp.path().join("data.ms");
let mut server = Server::new_auth_with_options(opt, temp).await;
server.use_api_key("MASTER_KEY");
let (indexes, code) = server.list_indexes(None, None).await;
assert_eq!(code, 200, "{indexes}");
snapshot!(indexes["results"].as_array().unwrap().len(), @"1");
snapshot!(indexes["results"][0]["uid"], @r###""pets""###);
snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###);
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index.settings().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"words",
"typo",
"proximity",
"attribute",
"sort",
"exactness"
],
"stopWords": [],
"nonSeparatorTokens": [],
"separatorTokens": [],
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
"oneTypo": 5,
"twoTypos": 9
},
"disableOnWords": [],
"disableOnAttributes": []
},
"faceting": {
"maxValuesPerFacet": 100,
"sortFacetValuesBy": {
"*": "alpha"
}
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
},
"searchCutoffMs": null
}
"###);
index
.search(json!({"retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###"
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 1,
"doggo": "echo",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 2,
"doggo": "intel",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 3,
"doggo": "bill",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 4,
"doggo": "max",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
}
]
"###);
})
.await;
}

View File

@ -1,25 +0,0 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 0,
"indexUid": "pets",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,19 +0,0 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 1,
"indexUid": "pets",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 5,
"indexedDocuments": 5
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -13,7 +13,6 @@ mod snapshot;
mod stats; mod stats;
mod swap_indexes; mod swap_indexes;
mod tasks; mod tasks;
mod vector;
// Tests are isolated by features in different modules to allow better readability, test // Tests are isolated by features in different modules to allow better readability, test
// targetability, and improved incremental compilation times. // targetability, and improved incremental compilation times.

View File

@ -107,39 +107,6 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
]) ])
}); });
static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Brown", "pattern": "stripped" },
},
{
"id": 2,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Black", "pattern": "stripped" },
},
{
"id": 3,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Blue", "pattern": "used" },
},
{
"id": 4,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": { "main": "Blue", "pattern": "stripped" },
}
])
});
static DOCUMENT_PRIMARY_KEY: &str = "id"; static DOCUMENT_PRIMARY_KEY: &str = "id";
static DOCUMENT_DISTINCT_KEY: &str = "product_id"; static DOCUMENT_DISTINCT_KEY: &str = "product_id";
@ -272,35 +239,3 @@ async fn distinct_search_with_pagination_no_ranking() {
snapshot!(response["totalPages"], @"2"); snapshot!(response["totalPages"], @"2");
snapshot!(response["totalHits"], @"6"); snapshot!(response["totalHits"], @"6");
} }
#[actix_rt::test]
async fn distinct_at_search_time() {
let server = Server::new().await;
let index = server.index("tamo");
let documents = NESTED_DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await;
let task = index.wait_task(task.uid()).await;
snapshot!(task, name: "succeed");
fn get_hits(response: &Value) -> Vec<String> {
let hits_array = response["hits"]
.as_array()
.unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap()));
hits_array
.iter()
.map(|h| h[DOCUMENT_PRIMARY_KEY].as_number().unwrap().to_string())
.collect::<Vec<_>>()
}
let (response, code) =
index.search_post(json!({"page": 1, "hitsPerPage": 3, "distinct": "color.main"})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"3");
snapshot!(format!("{:?}", hits), @r###"["1", "2", "3"]"###);
snapshot!(response["page"], @"1");
snapshot!(response["totalPages"], @"1");
snapshot!(response["totalHits"], @"3");
}

View File

@ -167,74 +167,6 @@ async fn search_bad_hits_per_page() {
"###); "###);
} }
#[actix_rt::test]
async fn search_bad_attributes_to_retrieve() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`",
"code": "invalid_search_attributes_to_retrieve",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve"
}
"###);
// Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings.
}
#[actix_rt::test]
async fn search_bad_retrieve_vectors() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_get("retrieveVectors=").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_get("retrieveVectors=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
}
#[actix_rt::test] #[actix_rt::test]
async fn search_bad_attributes_to_crop() { async fn search_bad_attributes_to_crop() {
let server = Server::new().await; let server = Server::new().await;
@ -389,40 +321,6 @@ async fn search_bad_facets() {
// Can't make the `attributes_to_highlight` fail with a get search since it'll accept anything as an array of strings. // Can't make the `attributes_to_highlight` fail with a get search since it'll accept anything as an array of strings.
} }
#[actix_rt::test]
async fn search_bad_threshold() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"rankingScoreThreshold": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found a string: `\"doggo\"`",
"code": "invalid_search_ranking_score_threshold",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold"
}
"###);
}
#[actix_rt::test]
async fn search_invalid_threshold() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"rankingScoreThreshold": 42})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_search_ranking_score_threshold",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold"
}
"###);
}
#[actix_rt::test] #[actix_rt::test]
async fn search_non_filterable_facets() { async fn search_non_filterable_facets() {
let server = Server::new().await; let server = Server::new().await;
@ -607,7 +505,7 @@ async fn search_bad_matching_strategy() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`, `frequency`", "message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`",
"code": "invalid_search_matching_strategy", "code": "invalid_search_matching_strategy",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy" "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
@ -629,7 +527,7 @@ async fn search_bad_matching_strategy() {
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###" snapshot!(json_string!(response), @r###"
{ {
"message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`, `frequency`", "message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`",
"code": "invalid_search_matching_strategy", "code": "invalid_search_matching_strategy",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy" "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
@ -1140,66 +1038,3 @@ async fn search_on_unknown_field_plus_joker() {
) )
.await; .await;
} }
#[actix_rt::test]
async fn distinct_at_search_time() {
let server = Server::new().await;
let index = server.index("tamo");
let (task, _) = index.create(None).await;
let task = index.wait_task(task.uid()).await;
snapshot!(task, name: "task-succeed");
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await;
index.wait_task(task.uid()).await;
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await;
index.wait_task(task.uid()).await;
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value type at `.distinct`: expected a string, but found a boolean: `true`",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
}

View File

@ -117,69 +117,3 @@ async fn geo_bounding_box_with_string_and_number() {
) )
.await; .await;
} }
#[actix_rt::test]
async fn bug_4640() {
// https://github.com/meilisearch/meilisearch/issues/4640
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, None).await;
index.update_settings_filterable_attributes(json!(["_geo"])).await;
let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.wait_task(ret.uid()).await;
// Sort the documents with the second one first
index
.search(
json!({
"sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
},
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
},
"_geoDistance": 9714063
},
{
"id": 3,
"name": "CrĂŞpe Truck",
"address": "2 Billig Avenue, Rouenville",
"type": "French",
"rating": 10
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"###);
},
)
.await;
}

View File

@ -124,29 +124,29 @@ async fn simple_search() {
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"2"); snapshot!(response["semanticHitCount"], @"2");
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
} }
@ -204,10 +204,10 @@ async fn distribution_shift() {
let server = Server::new().await; let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}});
let (response, code) = index.search_post(search.clone()).await; let (response, code) = index.search_post(search.clone()).await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
let (response, code) = index let (response, code) = index
.update_settings(json!({ .update_settings(json!({
@ -228,7 +228,7 @@ async fn distribution_shift() {
let (response, code) = index.search_post(search).await; let (response, code) = index.search_post(search).await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###);
} }
#[actix_rt::test] #[actix_rt::test]
@ -239,23 +239,20 @@ async fn highlighter() {
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.2}, "hybrid": {"semanticRatio": 0.2},
"retrieveVectors": true, "attributesToHighlight": [
"attributesToHighlight": [ "desc"
"desc",
"_vectors",
], ],
"highlightPreTag": "**BEGIN**", "highlightPreTag": "**BEGIN**",
"highlightPostTag": "**END**", "highlightPostTag": "**END**"
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.8}, "hybrid": {"semanticRatio": 0.8},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -265,14 +262,13 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
// no highlighting on full semantic // no highlighting on full semantic
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 1.0}, "hybrid": {"semanticRatio": 1.0},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -282,7 +278,7 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
} }
@ -365,12 +361,12 @@ async fn single_document() {
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###); snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###);
snapshot!(response["semanticHitCount"], @"1"); snapshot!(response["semanticHitCount"], @"1");
} }
@ -381,25 +377,25 @@ async fn query_combination() {
// search without query and vector, but with hybrid => still placeholder // search without query and vector, but with hybrid => still placeholder
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// same with a different semantic ratio // same with a different semantic ratio
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// wrong vector dimensions // wrong vector dimensions
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -414,34 +410,34 @@ async fn query_combination() {
// full vector // full vector
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
// full keyword, without a query // full keyword, without a query
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// query + vector, full keyword => keyword // query + vector, full keyword => keyword
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// query + vector, no hybrid parameter => error // query + vector, no hybrid parameter => error
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -457,7 +453,7 @@ async fn query_combination() {
// full vector, without a vector => error // full vector, without a vector => error
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
) )
.await; .await;
@ -474,93 +470,11 @@ async fn query_combination() {
// hybrid without a vector => full keyword // hybrid without a vector => full keyword
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
} }
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
}
]
"###);
// remove `_vectors` from displayed attributes
let (response, code) =
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2"
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3"
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1"
}
]
"###);
}
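For readers scanning these snapshots, here is a minimal sketch (not taken from the test suite) of the two `_vectors` shapes that appear in the hits above, written with the same `json!` idiom the tests use; `serde_json` is assumed, and the shape names are descriptive, not API terms. The explicit object form carries `embeddings` plus a `regenerate` flag, which these snapshots show as `false` for user-provided vectors and `true` for embedder-generated ones.

use serde_json::json;

fn main() {
    // Bare array form: a single embedding attached to the `default` embedder.
    let bare = json!({ "_vectors": { "default": [1.0, 2.0] } });

    // Explicit object form: a list of embeddings plus a `regenerate` flag.
    // In the snapshots above it is `false` for user-provided vectors and
    // `true` for vectors produced by an embedder.
    let explicit = json!({
        "_vectors": {
            "default": { "embeddings": [[1.0, 2.0]], "regenerate": false }
        }
    });

    assert!(bare["_vectors"]["default"].is_array());
    assert!(explicit["_vectors"]["default"]["embeddings"].is_array());
}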

View File

@ -1,128 +0,0 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use crate::common::index::Index;
use crate::common::{Server, Value};
use crate::json;
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
let index = server.index("test");
index.add_documents(documents.clone(), None).await;
index.wait_task(0).await;
index
}
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
"id": "1",
},
{
"title": "Captain Planet",
"id": "2",
},
{
"title": "Captain Marvel",
"id": "3",
},
{
"title": "a Captain Marvel ersatz",
"id": "4"
},
{
"title": "He's not part of the Marvel Cinematic Universe",
"id": "5"
},
{
"title": "a Shazam ersatz, but better than Captain Planet",
"id": "6"
},
{
"title": "Capitain CAAAAAVEEERNE!!!!",
"id": "7"
}
])
});
#[actix_rt::test]
async fn simple_search() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
index
.search(json!({"q": "Captain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"2"},{"id":"6"},{"id":"7"}]"###);
})
.await;
index
.search(json!({"q": "Captain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
})
.await;
index
.search(json!({"q": "Captain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
})
.await;
}
#[actix_rt::test]
async fn search_with_typo() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
index
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"7"},{"id":"2"},{"id":"6"}]"###);
})
.await;
index
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
})
.await;
index
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
})
.await;
}
#[actix_rt::test]
async fn search_with_unknown_word() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
index
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"2"},{"id":"3"},{"id":"4"},{"id":"6"},{"id":"7"}]"###);
})
.await;
index
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @"[]");
})
.await;
index
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
})
.await;
}

View File

@ -7,7 +7,6 @@ mod facet_search;
mod formatted; mod formatted;
mod geo; mod geo;
mod hybrid; mod hybrid;
mod matching_strategy;
mod multi; mod multi;
mod pagination; mod pagination;
mod restrict_searchable; mod restrict_searchable;
@ -48,31 +47,6 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
]) ])
}); });
static SCORE_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
},
{
"title": "Batman the dark knight returns: Part 2",
"id": "B",
},
{
"title": "Batman Returns",
"id": "C",
},
{
"title": "Batman",
"id": "D",
},
{
"title": "Badman",
"id": "E",
}
])
});
static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| { static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([ json!([
{ {
@ -301,7 +275,7 @@ async fn negative_special_cases_search() {
index.add_documents(documents, None).await; index.add_documents(documents, None).await;
index.wait_task(0).await; index.wait_task(0).await;
index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await; index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await;
index.wait_task(1).await; index.wait_task(1).await;
// There is a synonym for escape -> glass but we don't want "escape", only the derivatives: glass // There is a synonym for escape -> glass but we don't want "escape", only the derivatives: glass
@ -985,213 +959,6 @@ async fn test_score_details() {
.await; .await;
} }
#[actix_rt::test]
async fn test_score() {
let server = Server::new().await;
let index = server.index("test");
let documents = SCORE_DOCUMENTS.clone();
let res = index.add_documents(json!(documents), None).await;
index.wait_task(res.0.uid()).await;
index
.search(
json!({
"q": "Badman the dark knight returns 1",
"showRankingScore": true,
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
"_rankingScore": 0.9746605609456898
},
{
"title": "Batman the dark knight returns: Part 2",
"id": "B",
"_rankingScore": 0.8055252965383685
},
{
"title": "Badman",
"id": "E",
"_rankingScore": 0.16666666666666666
},
{
"title": "Batman Returns",
"id": "C",
"_rankingScore": 0.07702020202020202
},
{
"title": "Batman",
"id": "D",
"_rankingScore": 0.07702020202020202
}
]
"###);
},
)
.await;
}
#[actix_rt::test]
async fn test_score_threshold() {
let query = "Badman dark returns 1";
let server = Server::new().await;
let index = server.index("test");
let documents = SCORE_DOCUMENTS.clone();
let res = index.add_documents(json!(documents), None).await;
index.wait_task(res.0.uid()).await;
index
.search(
json!({
"q": query,
"showRankingScore": true,
"rankingScoreThreshold": 0.0
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"5");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
"_rankingScore": 0.93430081300813
},
{
"title": "Batman the dark knight returns: Part 2",
"id": "B",
"_rankingScore": 0.6685627880184332
},
{
"title": "Badman",
"id": "E",
"_rankingScore": 0.25
},
{
"title": "Batman Returns",
"id": "C",
"_rankingScore": 0.11553030303030302
},
{
"title": "Batman",
"id": "D",
"_rankingScore": 0.11553030303030302
}
]
"###);
},
)
.await;
index
.search(
json!({
"q": query,
"showRankingScore": true,
"rankingScoreThreshold": 0.2
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"3"###);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
"_rankingScore": 0.93430081300813
},
{
"title": "Batman the dark knight returns: Part 2",
"id": "B",
"_rankingScore": 0.6685627880184332
},
{
"title": "Badman",
"id": "E",
"_rankingScore": 0.25
}
]
"###);
},
)
.await;
index
.search(
json!({
"q": query,
"showRankingScore": true,
"rankingScoreThreshold": 0.5
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"2"###);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
"_rankingScore": 0.93430081300813
},
{
"title": "Batman the dark knight returns: Part 2",
"id": "B",
"_rankingScore": 0.6685627880184332
}
]
"###);
},
)
.await;
index
.search(
json!({
"q": query,
"showRankingScore": true,
"rankingScoreThreshold": 0.8
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"1"###);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Batman the dark knight returns: Part 1",
"id": "A",
"_rankingScore": 0.93430081300813
}
]
"###);
},
)
.await;
index
.search(
json!({
"q": query,
"showRankingScore": true,
"rankingScoreThreshold": 1.0
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"0"###);
// nobody is perfect
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @"[]");
},
)
.await;
}
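A quick sketch of the threshold behaviour these snapshots exercise, under the straightforward reading that a hit is kept only when its `_rankingScore` reaches `rankingScoreThreshold`; the boundary case never occurs in the data above, so the `>=` comparison is an assumption, and the helper below is hypothetical, not part of the test suite.

// Hypothetical helper: keep a hit when its ranking score reaches the threshold.
fn passes_threshold(ranking_score: f64, threshold: f64) -> bool {
    ranking_score >= threshold
}

// Mirrors the snapshots: "Badman" (score 0.25) survives a 0.2 threshold
// but is dropped at 0.5.
// assert!(passes_threshold(0.25, 0.2));
// assert!(!passes_threshold(0.25, 0.5));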
#[actix_rt::test] #[actix_rt::test]
async fn test_degraded_score_details() { async fn test_degraded_score_details() {
let server = Server::new().await; let server = Server::new().await;
@ -1290,38 +1057,21 @@ async fn experimental_feature_vector_store() {
index.add_documents(json!(documents), None).await; index.add_documents(json!(documents), None).await;
index.wait_task(0).await; index.wait_task(0).await;
index let (response, code) = index
.search(json!({ .search_post(json!({
"vector": [1.0, 2.0, 3.0], "vector": [1.0, 2.0, 3.0],
"showRankingScore": true "showRankingScore": true
}), |response, code|{ }))
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
.await;
index
.search(json!({
"retrieveVectors": true,
"showRankingScore": true
}), |response, code|{
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
.await; .await;
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
let (response, code) = server.set_features(json!({"vectorStore": true})).await; let (response, code) = server.set_features(json!({"vectorStore": true})).await;
meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(code, @"200 OK");
@ -1354,7 +1104,6 @@ async fn experimental_feature_vector_store() {
.search_post(json!({ .search_post(json!({
"vector": [1.0, 2.0, 3.0], "vector": [1.0, 2.0, 3.0],
"showRankingScore": true, "showRankingScore": true,
"retrieveVectors": true,
})) }))
.await; .await;
@ -1366,16 +1115,11 @@ async fn experimental_feature_vector_store() {
"title": "Shazam!", "title": "Shazam!",
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 1.0,
[ 2.0,
1.0, 3.0
2.0, ]
3.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 1.0 "_rankingScore": 1.0
}, },
@ -1383,16 +1127,11 @@ async fn experimental_feature_vector_store() {
"title": "Captain Marvel", "title": "Captain Marvel",
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 1.0,
[ 2.0,
1.0, 54.0
2.0, ]
54.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.9129111766815186 "_rankingScore": 0.9129111766815186
}, },
@ -1400,16 +1139,11 @@ async fn experimental_feature_vector_store() {
"title": "Gläss", "title": "Gläss",
"id": "450465", "id": "450465",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ -100.0,
[ 340.0,
-100.0, 90.0
340.0, ]
90.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.8106412887573242 "_rankingScore": 0.8106412887573242
}, },
@ -1417,16 +1151,11 @@ async fn experimental_feature_vector_store() {
"title": "How to Train Your Dragon: The Hidden World", "title": "How to Train Your Dragon: The Hidden World",
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ -100.0,
[ 231.0,
-100.0, 32.0
231.0, ]
32.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.7412010431289673 "_rankingScore": 0.7412010431289673
}, },
@ -1434,16 +1163,11 @@ async fn experimental_feature_vector_store() {
"title": "Escape Room", "title": "Escape Room",
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 10.0,
[ -23.0,
10.0, 32.0
-23.0, ]
32.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.6972063183784485 "_rankingScore": 0.6972063183784485
} }

View File

@ -1,20 +0,0 @@
---
source: meilisearch/tests/search/distinct.rs
---
{
"uid": 1,
"indexUid": "tamo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"filterableAttributes": [
"color.main"
]
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,18 +0,0 @@
---
source: meilisearch/tests/search/errors.rs
---
{
"uid": 0,
"indexUid": "tamo",
"status": "succeeded",
"type": "indexCreation",
"canceledBy": null,
"details": {
"primaryKey": null
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -87,68 +87,6 @@ async fn similar_bad_id() {
"###); "###);
} }
#[actix_rt::test]
async fn similar_bad_ranking_score_threshold() {
let server = Server::new().await;
let index = server.index("test");
server.set_features(json!({"vectorStore": true})).await;
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
"filterableAttributes": ["title"]}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let (response, code) = index.similar_post(json!({"rankingScoreThreshold": ["doggo"]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found an array: `[\"doggo\"]`",
"code": "invalid_similar_ranking_score_threshold",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold"
}
"###);
}
#[actix_rt::test]
async fn similar_invalid_ranking_score_threshold() {
let server = Server::new().await;
let index = server.index("test");
server.set_features(json!({"vectorStore": true})).await;
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
"filterableAttributes": ["title"]}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let (response, code) = index.similar_post(json!({"rankingScoreThreshold": 42})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_similar_ranking_score_threshold",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold"
}
"###);
}
#[actix_rt::test] #[actix_rt::test]
async fn similar_invalid_id() { async fn similar_invalid_id() {
let server = Server::new().await; let server = Server::new().await;
@ -756,54 +694,3 @@ async fn filter_reserved_geo_point_string() {
}) })
.await; .await;
} }
#[actix_rt::test]
async fn similar_bad_retrieve_vectors() {
let server = Server::new().await;
server.set_features(json!({"vectorStore": true})).await;
let index = server.index("test");
let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_get("retrieveVectors=").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_get("retrieveVectors=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
}

View File

@ -78,7 +78,7 @@ async fn basic() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { .similar(json!({"id": 143}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -87,16 +87,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.1,
[ 0.6,
0.10000000149011612, 0.8
0.6000000238418579, ]
0.800000011920929
]
],
"regenerate": false
}
} }
}, },
{ {
@ -104,16 +99,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.6,
[ 0.8,
0.6000000238418579, -0.2
0.800000011920929, ]
-0.20000000298023224
]
],
"regenerate": false
}
} }
}, },
{ {
@ -121,16 +111,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.7,
[ 0.7,
0.699999988079071, -0.4
0.699999988079071, ]
-0.4000000059604645
]
],
"regenerate": false
}
} }
}, },
{ {
@ -138,16 +123,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.8,
[ 0.4,
0.800000011920929, -0.5
0.4000000059604645, ]
-0.5
]
],
"regenerate": false
}
} }
} }
] ]
@ -156,7 +136,7 @@ async fn basic() {
.await; .await;
index index
.similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { .similar(json!({"id": "299537"}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -165,16 +145,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.7,
[ 0.7,
0.699999988079071, -0.4
0.699999988079071, ]
-0.4000000059604645
]
],
"regenerate": false
}
} }
}, },
{ {
@ -182,16 +157,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.8,
[ 0.4,
0.800000011920929, -0.5
0.4000000059604645, ]
-0.5
]
],
"regenerate": false
}
} }
}, },
{ {
@ -199,16 +169,11 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.1,
[ 0.6,
0.10000000149011612, 0.8
0.6000000238418579, ]
0.800000011920929
]
],
"regenerate": false
}
} }
}, },
{ {
@ -216,16 +181,11 @@ async fn basic() {
"release_year": 1930, "release_year": 1930,
"id": "143", "id": "143",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ -0.5,
[ 0.3,
-0.5, 0.85
0.30000001192092896, ]
0.8500000238418579
]
],
"regenerate": false
}
} }
} }
] ]
@ -234,285 +194,6 @@ async fn basic() {
.await; .await;
} }
#[actix_rt::test]
async fn ranking_score_threshold() {
let server = Server::new().await;
let index = server.index("test");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
"filterableAttributes": ["title"]}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = DOCUMENTS.clone();
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Escape Room",
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
},
"_rankingScore": 0.890957772731781
},
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
},
"_rankingScore": 0.39060014486312866
},
{
"title": "How to Train Your Dragon: The Hidden World",
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
},
"_rankingScore": 0.2819308042526245
},
{
"title": "Shazam!",
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
},
"_rankingScore": 0.1662663221359253
}
]
"###);
},
)
.await;
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Escape Room",
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
},
"_rankingScore": 0.890957772731781
},
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
},
"_rankingScore": 0.39060014486312866
},
{
"title": "How to Train Your Dragon: The Hidden World",
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
},
"_rankingScore": 0.2819308042526245
}
]
"###);
},
)
.await;
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Escape Room",
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
},
"_rankingScore": 0.890957772731781
},
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
},
"_rankingScore": 0.39060014486312866
}
]
"###);
},
)
.await;
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Escape Room",
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
},
"_rankingScore": 0.890957772731781
}
]
"###);
},
)
.await;
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @"[]");
},
)
.await;
}
#[actix_rt::test] #[actix_rt::test]
async fn filter() { async fn filter() {
let server = Server::new().await; let server = Server::new().await;
@ -546,97 +227,71 @@ async fn filter() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar( .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| {
json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), snapshot!(code, @"200 OK");
|response, code| { snapshot!(json_string!(response["hits"]), @r###"
snapshot!(code, @"200 OK"); [
snapshot!(json_string!(response["hits"]), @r###" {
[ "title": "Captain Marvel",
{ "release_year": 2019,
"title": "Captain Marvel", "id": "299537",
"release_year": 2019, "_vectors": {
"id": "299537", "manual": [
"_vectors": { 0.6,
"manual": { 0.8,
"embeddings": [ -0.2
[ ]
0.6000000238418579, }
0.800000011920929, },
-0.20000000298023224 {
] "title": "How to Train Your Dragon: The Hidden World",
], "release_year": 2019,
"regenerate": false "id": "166428",
} "_vectors": {
} "manual": [
}, 0.7,
{ 0.7,
"title": "How to Train Your Dragon: The Hidden World", -0.4
"release_year": 2019, ]
"id": "166428", }
"_vectors": { },
"manual": { {
"embeddings": [ "title": "Shazam!",
[ "release_year": 2019,
0.699999988079071, "id": "287947",
0.699999988079071, "_vectors": {
-0.4000000059604645 "manual": [
] 0.8,
], 0.4,
"regenerate": false -0.5
} ]
} }
}, }
{ ]
"title": "Shazam!", "###);
"release_year": 2019, })
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
index index
.similar( .similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| {
json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), snapshot!(code, @"200 OK");
|response, code| { snapshot!(json_string!(response["hits"]), @r###"
snapshot!(code, @"200 OK"); [
snapshot!(json_string!(response["hits"]), @r###" {
[ "title": "All Quiet on the Western Front",
{ "release_year": 1930,
"title": "All Quiet on the Western Front", "id": "143",
"release_year": 1930, "_vectors": {
"id": "143", "manual": [
"_vectors": { -0.5,
"manual": { 0.3,
"embeddings": [ 0.85
[ ]
-0.5, }
0.30000001192092896, }
0.8500000238418579 ]
] "###);
], })
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
} }
@ -673,7 +328,7 @@ async fn limit_and_offset() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| { .similar(json!({"id": 143, "limit": 1}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -682,16 +337,11 @@ async fn limit_and_offset() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": { "manual": [
"embeddings": [ 0.1,
[ 0.6,
0.10000000149011612, 0.8
0.6000000238418579, ]
0.800000011920929
]
],
"regenerate": false
}
} }
} }
] ]
@ -700,32 +350,24 @@ async fn limit_and_offset() {
.await; .await;
index index
.similar( .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| {
json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), snapshot!(code, @"200 OK");
|response, code| { snapshot!(json_string!(response["hits"]), @r###"
snapshot!(code, @"200 OK"); [
snapshot!(json_string!(response["hits"]), @r###" {
[ "title": "Captain Marvel",
{ "release_year": 2019,
"title": "Captain Marvel", "id": "299537",
"release_year": 2019, "_vectors": {
"id": "299537", "manual": [
"_vectors": { 0.6,
"manual": { 0.8,
"embeddings": [ -0.2
[ ]
0.6000000238418579, }
0.800000011920929, }
-0.20000000298023224 ]
] "###);
], })
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
} }

View File

@ -31,7 +31,6 @@ macro_rules! verify_snapshot {
} }
#[actix_rt::test] #[actix_rt::test]
#[cfg_attr(target_os = "windows", ignore)]
async fn perform_snapshot() { async fn perform_snapshot() {
let temp = tempfile::tempdir().unwrap(); let temp = tempfile::tempdir().unwrap();
let snapshot_dir = tempfile::tempdir().unwrap(); let snapshot_dir = tempfile::tempdir().unwrap();

View File

@ -1,589 +0,0 @@
mod rest;
mod settings;
use meili_snap::{json_string, snapshot};
use crate::common::index::Index;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
#[actix_rt::test]
async fn add_remove_user_provided() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
10.0,
10.0,
10.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (value, code) = index.delete_document(0).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}
async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
{"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }},
{"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}},
{"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
index
}
#[actix_rt::test]
async fn user_provided_embeddings_error() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
// First case, we forget to specify the `regenerate`
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 2,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Second case, we don't specify anything
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 3,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Third case, we specify something wrong in place of regenerate
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 4,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 5,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 6,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 7,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 10,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 11,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 12,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_rt::test]
async fn clear_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
let (value, _code) = index.clear_all_documents().await;
index.wait_task(value.uid()).await;
// Make sure the documents DB has been cleared
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [],
"offset": 0,
"limit": 20,
"total": 0
}
"###);
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(documents, @r###"
{
"hits": [],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0,
"semanticHitCount": 0
}
"###);
}
#[actix_rt::test]
async fn add_remove_one_vector_4588() {
// https://github.com/meilisearch/meilisearch/issues/4588
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, name: "settings-processed");
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-added");
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-deleted");
let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await;
snapshot!(documents, @r###"
{
"hits": [
{
"id": 0,
"name": "kefir"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1,
"semanticHitCount": 1
}
"###);
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}
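As a minimal sketch (not part of this diff, and assuming only `serde_json`), the two `_vectors` payload shapes exercised by the tests above — the bare-array shorthand and the explicit object with a mandatory `regenerate` field — look like this:
use serde_json::{json, Value};
// Illustrative only: the embedder name `manual` and the values mirror the tests above.
fn example_vectors_payloads() -> (Value, Value) {
    // Shorthand: a bare array of numbers is a single embedding.
    let shorthand = json!({
        "id": 0,
        "name": "kefir",
        "_vectors": { "manual": [0.0, 0.0, 0.0] }
    });
    // Explicit form: `regenerate` is mandatory; `embeddings` is either one
    // embedding (an array of numbers) or several (an array of such arrays).
    let explicit = json!({
        "id": 0,
        "name": "kefir",
        "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.0, 0.0, 0.0]] } }
    });
    (shorthand, explicit)
}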

View File

@ -1,339 +0,0 @@
use crate::vector::GetAllDocumentsOptions;
use meili_snap::{json_string, snapshot};
use std::sync::atomic::{AtomicUsize, Ordering};
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, Request, ResponseTemplate};
use crate::common::{Server, Value};
use crate::json;
static COUNTER: AtomicUsize = AtomicUsize::new(0);
async fn create_mock() -> (MockServer, Value) {
let mock_server = MockServer::start().await;
Mock::given(method("POST"))
.and(path("/"))
.respond_with(|_req: &Request| {
let cpt = COUNTER.fetch_add(1, Ordering::Relaxed);
ResponseTemplate::new(200).set_body_json(json!({ "data": vec![cpt; 3] }))
})
.mount(&mock_server)
.await;
let url = mock_server.uri();
let embedder_settings = json!({
"source": "rest",
"url": url,
"dimensions": 3,
"query": {},
});
(mock_server, embedder_settings)
}
#[actix_rt::test]
async fn dummy_testing_the_mock() {
let (mock, _setting) = create_mock().await;
let body = reqwest::get(&mock.uri()).await.unwrap().text().await.unwrap();
snapshot!(body, @"[0,0,0]");
let body = reqwest::get(&mock.uri()).await.unwrap().text().await.unwrap();
snapshot!(body, @"[1,1,1]");
let body = reqwest::get(&mock.uri()).await.unwrap().text().await.unwrap();
snapshot!(body, @"[2,2,2]");
let body = reqwest::get(&mock.uri()).await.unwrap().text().await.unwrap();
snapshot!(body, @"[3,3,3]");
let body = reqwest::get(&mock.uri()).await.unwrap().text().await.unwrap();
snapshot!(body, @"[4,4,4]");
}
async fn get_server_vector() -> Server {
let server = Server::new().await;
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
server
}
#[actix_rt::test]
async fn bad_settings() {
let (mock, _setting) = create_mock().await;
let server = get_server_vector().await;
let index = server.index("doggo");
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest" }),
},
}))
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "`.embedders.rest`: Missing field `url` (note: this field is mandatory for source rest)",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": "kefir" }),
},
}))
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "`.embedders.rest.url`: could not parse `kefir`: relative URL without a base",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri() }),
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": 0,
"indexUid": "doggo",
"status": "failed",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"rest": {
"source": "rest",
"url": "[url]"
}
}
},
"error": {
"message": "internal: Error while generating embeddings: runtime error: could not determine model dimensions: test embedding failed with user error: was expected 'input' to be an object in query 'null'.",
"code": "internal",
"type": "internal",
"link": "https://docs.meilisearch.com/errors#internal"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "query": {} }),
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": 1,
"indexUid": "doggo",
"status": "failed",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"rest": {
"source": "rest",
"url": "[url]",
"query": {}
}
}
},
"error": {
"message": "internal: Error while generating embeddings: runtime error: could not determine model dimensions: test embedding failed with error: component `embedding` not found in path `embedding` in response: `{\n \"data\": [\n 0,\n 0,\n 0\n ]\n}`.",
"code": "internal",
"type": "internal",
"link": "https://docs.meilisearch.com/errors#internal"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "query": {}, "pathToEmbeddings": ["data"] }),
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": 2,
"indexUid": "doggo",
"status": "failed",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"rest": {
"source": "rest",
"url": "[url]",
"query": {},
"pathToEmbeddings": [
"data"
]
}
}
},
"error": {
"message": "internal: Error while generating embeddings: runtime error: could not determine model dimensions: test embedding failed with error: component `embedding` not found in path `embedding` in response: `{\n \"data\": [\n 1,\n 1,\n 1\n ]\n}`.",
"code": "internal",
"type": "internal",
"link": "https://docs.meilisearch.com/errors#internal"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "query": {}, "embeddingObject": ["data"] }),
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": 3,
"indexUid": "doggo",
"status": "failed",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"rest": {
"source": "rest",
"url": "[url]",
"query": {},
"embeddingObject": [
"data"
]
}
}
},
"error": {
"message": "internal: Error while generating embeddings: runtime error: could not determine model dimensions: test embedding failed with error: component `data` not found in path `data` in response: `{\n \"data\": [\n 2,\n 2,\n 2\n ]\n}`.",
"code": "internal",
"type": "internal",
"link": "https://docs.meilisearch.com/errors#internal"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Configure an embedder with a bad dimension of 2 instead of 3: the settings update is accepted, but indexing a document then fails
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "query": {}, "pathToEmbeddings": [], "embeddingObject": ["data"], "dimensions": 2 }),
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let (response, code) = index.add_documents(json!( { "id": 1, "name": "kefir" }), None).await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": 5,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "An unexpected crash occurred when processing the task.",
"code": "internal",
"type": "internal",
"link": "https://docs.meilisearch.com/errors#internal"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_rt::test]
async fn add_vector_and_user_provided() {
let (_mock, setting) = create_mock().await;
let server = get_server_vector().await;
let index = server.index("doggo");
let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": setting,
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let documents = json!([
{"id": 0, "name": "kefir"},
{"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }},
{"id": 2, "name": "intel"},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @"");
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [],
"offset": 0,
"limit": 20,
"total": 0
}
"###);
}

View File

@ -1,228 +0,0 @@
use meili_snap::{json_string, snapshot};
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
use crate::vector::generate_default_user_provided_documents;
#[actix_rt::test]
async fn update_embedder() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": { "manual": {}},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
let ret = server.wait_task(response.uid()).await;
snapshot!(ret, @r###"
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_rt::test]
async fn reset_embedder_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
let (response, code) = index.delete_settings().await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Make sure the documents are still present
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
limit: None,
offset: None,
retrieve_vectors: false,
fields: None,
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
},
{
"id": 2,
"name": "billou"
},
{
"id": 3,
"name": "intel"
},
{
"id": 4,
"name": "max"
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure we are still able to retrieve their vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
},
{
"id": 2,
"name": "billou",
"_vectors": {
"manual": {
"embeddings": [
[
2.0,
2.0,
2.0
],
[
2.0,
2.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 3,
"name": "intel",
"_vectors": {
"manual": {
"embeddings": [
[
3.0,
3.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 4,
"name": "max",
"_vectors": {
"manual": {
"embeddings": [
[
4.0,
4.0,
4.0
],
[
4.0,
4.0,
5.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(json_string!(documents), @r###"
{
"message": "Cannot find embedder with name `default`.",
"code": "invalid_embedder",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_embedder"
}
"###);
}

View File

@ -1,19 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,19 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 2,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,23 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 0,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.9.0" bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0" byteorder = "1.5.0"
charabia = { version = "0.8.11", default-features = false } charabia = { version = "0.8.10", default-features = false }
concat-arrays = "0.1.2" concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11" crossbeam-channel = "0.5.11"
deserr = "0.6.1" deserr = "0.6.1"
@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0" ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] } rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0" rayon = "1.8.0"
roaring = { version = "0.10.2", features = ["serde"] } roaring = "0.10.2"
rstar = { version = "0.11.0", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] } serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] } serde_json = { version = "1.0.111", features = ["preserve_order"] }
@ -71,15 +71,15 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" } candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" } candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" } candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [ tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
"onig", "onig",
] } ] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online", "online",
] } ] }
tiktoken-rs = "0.5.8" tiktoken-rs = "0.5.8"
liquid = "0.26.4" liquid = "0.26.4"
arroy = "0.4.0" arroy = "0.3.1"
rand = "0.8.5" rand = "0.8.5"
tracing = "0.1.40" tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] } ureq = { version = "2.9.7", features = ["json"] }

View File

@ -59,7 +59,6 @@ fn main() -> Result<(), Box<dyn Error>> {
false, false,
universe, universe,
&None, &None,
&None,
GeoSortStrategy::default(), GeoSortStrategy::default(),
0, 0,
20, 20,
@ -67,7 +66,6 @@ fn main() -> Result<(), Box<dyn Error>> {
&mut DefaultSearchLogger, &mut DefaultSearchLogger,
logger, logger,
TimeBudget::max(), TimeBudget::max(),
None,
)?; )?;
if let Some((logger, dir)) = detailed_logger { if let Some((logger, dir)) = detailed_logger {
logger.finish(&mut ctx, Path::new(dir))?; logger.finish(&mut ctx, Path::new(dir))?;

View File

@ -119,8 +119,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidVectorDimensions { expected: usize, found: usize }, InvalidVectorDimensions { expected: usize, found: usize },
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value }, InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
#[error("{0}")] #[error("{0}")]
InvalidFilter(String), InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@ -136,17 +134,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
} }
)] )]
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool }, InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}",
.field,
match .valid_fields.is_empty() {
true => "This index does not have configured filterable attributes.".to_string(),
false => format!("Available filterable attributes are: `{}{}`.",
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
),
}
)]
InvalidDistinctAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not facet-searchable. {}", #[error("Attribute `{}` is not facet-searchable. {}",
.field, .field,
match .valid_fields.is_empty() { match .valid_fields.is_empty() {
@ -283,9 +270,8 @@ impl From<arroy::Error> for Error {
arroy::Error::DatabaseFull arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend | arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. } | arroy::Error::UnmatchingDistance { .. }
| arroy::Error::NeedBuild(_) | arroy::Error::MissingNode
| arroy::Error::MissingKey { .. } | arroy::Error::MissingMetadata => {
| arroy::Error::MissingMetadata(_) => {
Error::InternalError(InternalError::ArroyError(value)) Error::InternalError(InternalError::ArroyError(value))
} }
} }

View File

@ -4,7 +4,6 @@ use std::collections::HashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::{FieldId, FieldsIdsMap, Weight}; use crate::{FieldId, FieldsIdsMap, Weight};
#[derive(Debug, Default, Serialize, Deserialize)] #[derive(Debug, Default, Serialize, Deserialize)]
@ -24,13 +23,7 @@ impl FieldidsWeightsMap {
/// Should only be called in the case there are NO searchable attributes. /// Should only be called in the case there are NO searchable attributes.
/// All the fields will be inserted in the order of the fields ids map with a weight of 0. /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
FieldidsWeightsMap { FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
map: fid_map
.iter()
.filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|(fid, _name)| (fid, 0))
.collect(),
}
} }
/// Removes a field id from the map, returning the associated weight previously in the map. /// Removes a field id from the map, returning the associated weight previously in the map.

View File

@ -41,16 +41,6 @@ impl FieldsIdsMap {
} }
} }
/// Get the ids of a field and all its nested fields based on its name.
pub fn nested_ids(&self, name: &str) -> Vec<FieldId> {
self.names_ids
.range(name.to_string()..)
.take_while(|(key, _)| key.starts_with(name))
.filter(|(key, _)| crate::is_faceted_by(key, name))
.map(|(_name, id)| *id)
.collect()
}
/// Get the id of a field based on its name. /// Get the id of a field based on its name.
pub fn id(&self, name: &str) -> Option<FieldId> { pub fn id(&self, name: &str) -> Option<FieldId> {
self.names_ids.get(name).copied() self.names_ids.get(name).copied()
@ -136,32 +126,4 @@ mod tests {
assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), Some((3, "title")));
assert_eq!(iter.next(), None); assert_eq!(iter.next(), None);
} }
#[test]
fn nested_fields() {
let mut map = FieldsIdsMap::new();
assert_eq!(map.insert("id"), Some(0));
assert_eq!(map.insert("doggo"), Some(1));
assert_eq!(map.insert("doggo.name"), Some(2));
assert_eq!(map.insert("doggolution"), Some(3));
assert_eq!(map.insert("doggo.breed.name"), Some(4));
assert_eq!(map.insert("description"), Some(5));
insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###"
[
1,
4,
2,
]
"###);
insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###"
[
4,
]
"###);
insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]");
}
} }

View File

@ -47,12 +47,6 @@ pub struct FacetGroupValue {
pub bitmap: RoaringBitmap, pub bitmap: RoaringBitmap,
} }
#[derive(Debug)]
pub struct FacetGroupLazyValue<'b> {
pub size: u8,
pub bitmap_bytes: &'b [u8],
}
pub struct FacetGroupKeyCodec<T> { pub struct FacetGroupKeyCodec<T> {
_phantom: PhantomData<T>, _phantom: PhantomData<T>,
} }
@ -75,7 +69,6 @@ where
Ok(Cow::Owned(v)) Ok(Cow::Owned(v))
} }
} }
impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T> impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
where where
T: BytesDecode<'a>, T: BytesDecode<'a>,
@ -91,7 +84,6 @@ where
} }
pub struct FacetGroupValueCodec; pub struct FacetGroupValueCodec;
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
type EItem = FacetGroupValue; type EItem = FacetGroupValue;
@ -101,23 +93,11 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
Ok(Cow::Owned(v)) Ok(Cow::Owned(v))
} }
} }
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
type DItem = FacetGroupValue; type DItem = FacetGroupValue;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let size = bytes[0]; let size = bytes[0];
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
Ok(FacetGroupValue { size, bitmap }) Ok(FacetGroupValue { size, bitmap })
} }
} }
pub struct FacetGroupLazyValueCodec;
impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec {
type DItem = FacetGroupLazyValue<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] })
}
}

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::io::{self, Cursor}; use std::io;
use std::mem::size_of; use std::mem::size_of;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
@ -57,24 +57,6 @@ impl CboRoaringBitmapCodec {
} }
} }
pub fn intersection_with_serialized(
mut bytes: &[u8],
other: &RoaringBitmap,
) -> io::Result<RoaringBitmap> {
// See above `deserialize_from` method for implementation details.
if bytes.len() <= THRESHOLD * size_of::<u32>() {
let mut bitmap = RoaringBitmap::new();
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
if other.contains(integer) {
bitmap.insert(integer);
}
}
Ok(bitmap)
} else {
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
}
}
/// Merge serialized CboRoaringBitmaps in a buffer. /// Merge serialized CboRoaringBitmaps in a buffer.
/// ///
/// if the merged values length is under the threshold, values are directly /// if the merged values length is under the threshold, values are directly

View File

@ -9,7 +9,6 @@ use heed::types::*;
use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rstar::RTree; use rstar::RTree;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime; use time::OffsetDateTime;
use crate::documents::PrimaryKey; use crate::documents::PrimaryKey;
@ -24,7 +23,6 @@ use crate::heed_codec::{
}; };
use crate::order_by_map::OrderByMap; use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision; use crate::proximity::ProximityPrecision;
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::{Embedding, EmbeddingConfig}; use crate::vector::{Embedding, EmbeddingConfig};
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -646,7 +644,6 @@ impl Index {
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
user_fields: &[&str], user_fields: &[&str],
non_searchable_fields_ids: &[FieldId],
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
) -> Result<()> { ) -> Result<()> {
// We can write the user defined searchable fields as-is. // We can write the user defined searchable fields as-is.
@ -665,7 +662,6 @@ impl Index {
for (weight, user_field) in user_fields.iter().enumerate() { for (weight, user_field) in user_fields.iter().enumerate() {
if crate::is_faceted_by(field_from_map, user_field) if crate::is_faceted_by(field_from_map, user_field)
&& !real_fields.contains(&field_from_map) && !real_fields.contains(&field_from_map)
&& !non_searchable_fields_ids.contains(&id)
{ {
real_fields.push(field_from_map); real_fields.push(field_from_map);
@ -712,7 +708,6 @@ impl Index {
Ok(self Ok(self
.fields_ids_map(rtxn)? .fields_ids_map(rtxn)?
.names() .names()
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|field| Cow::Owned(field.to_string())) .map(|field| Cow::Owned(field.to_string()))
.collect()) .collect())
}) })
@ -1573,16 +1568,12 @@ impl Index {
Ok(script_language) Ok(script_language)
} }
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs( pub(crate) fn put_embedding_configs(
&self, &self,
wtxn: &mut RwTxn<'_>, wtxn: &mut RwTxn<'_>,
configs: Vec<IndexEmbeddingConfig>, configs: Vec<(String, EmbeddingConfig)>,
) -> heed::Result<()> { ) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put( self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
wtxn, wtxn,
main_key::EMBEDDING_CONFIGS, main_key::EMBEDDING_CONFIGS,
&configs, &configs,
@ -1593,10 +1584,13 @@ impl Index {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS) self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
} }
pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> { pub fn embedding_configs(
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
Ok(self Ok(self
.main .main
.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>() .remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)? .get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()) .unwrap_or_default())
} }
@ -1610,7 +1604,7 @@ impl Index {
arroy::Reader::open(rtxn, k, self.vector_arroy) arroy::Reader::open(rtxn, k, self.vector_arroy)
.map(Some) .map(Some)
.or_else(|e| match e { .or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None), arroy::Error::MissingMetadata => Ok(None),
e => Err(e.into()), e => Err(e.into()),
}) })
.transpose() .transpose()
@ -1643,7 +1637,7 @@ impl Index {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some) .map(Some)
.or_else(|e| match e { .or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None), arroy::Error::MissingMetadata => Ok(None),
e => Err(e), e => Err(e),
}) })
.transpose(); .transpose();
@ -1668,13 +1662,6 @@ impl Index {
} }
} }
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub config: EmbeddingConfig,
pub user_provided: RoaringBitmap,
}
#[cfg(test)] #[cfg(test)]
pub(crate) mod tests { pub(crate) mod tests {
use std::collections::HashSet; use std::collections::HashSet;
@ -1682,17 +1669,15 @@ pub(crate) mod tests {
use big_s::S; use big_s::S;
use heed::{EnvOpenOptions, RwTxn}; use heed::{EnvOpenOptions, RwTxn};
use maplit::{btreemap, hashset}; use maplit::hashset;
use tempfile::TempDir; use tempfile::TempDir;
use crate::documents::DocumentsBatchReader; use crate::documents::DocumentsBatchReader;
use crate::error::{Error, InternalError}; use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{ use crate::update::{
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
Settings,
}; };
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
pub(crate) struct TempIndex { pub(crate) struct TempIndex {
@ -2798,95 +2783,4 @@ pub(crate) mod tests {
] ]
"###); "###);
} }
#[test]
fn vectors_are_never_indexed_as_searchable_or_filterable() {
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "_vectors": { "doggo": [2345] } },
{ "id": 1, "_vectors": { "doggo": [6789] } },
]))
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @r###"["id"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
drop(rtxn);
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
index
.update_settings(|settings| {
settings.set_embedder_settings(btreemap! {
S("doggo") => Setting::Set(EmbeddingSettings {
dimensions: Setting::Set(1),
source: Setting::Set(EmbedderSource::UserProvided),
..EmbeddingSettings::default()}),
});
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
}
} }

View File

@ -6,11 +6,9 @@ use heed::Result;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level}; use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec; use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId}; use crate::DocumentId;
/// Call the given closure on the facet distribution of the candidate documents. /// Call the given closure on the facet distribution of the candidate documents.
/// ///
@ -33,11 +31,14 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>, CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{ {
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
Ok(()) Ok(())
} else { } else {
@ -74,10 +75,13 @@ where
// Represents the list of keys that we must explore. // Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new(); let mut heap = BinaryHeap::new();
let db = db.remap_data_type::<FacetGroupLazyValueCodec>(); let highest_level = get_highest_level(
let highest_level = get_highest_level(rtxn, db, field_id)?; rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level // We first fill the heap with values from the highest level
let starting_key = let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
@ -88,10 +92,7 @@ where
if key.field_id != field_id { if key.field_id != field_id {
break; break;
} }
let intersection = CboRoaringBitmapCodec::intersection_with_serialized( let intersection = value.bitmap & candidates;
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len(); let count = intersection.len();
if count != 0 { if count != 0 {
heap.push(LevelEntry { heap.push(LevelEntry {
@ -120,10 +121,7 @@ where
if key.field_id != field_id { if key.field_id != field_id {
break; break;
} }
let intersection = CboRoaringBitmapCodec::intersection_with_serialized( let intersection = value.bitmap & candidates;
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len(); let count = intersection.len();
if count != 0 { if count != 0 {
heap.push(LevelEntry { heap.push(LevelEntry {
@ -148,7 +146,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>, CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{ {
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
callback: CB, callback: CB,
} }
@ -173,10 +171,7 @@ where
if key.field_id != self.field_id { if key.field_id != self.field_id {
return Ok(ControlFlow::Break(())); return Ok(ControlFlow::Break(()));
} }
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( let docids_in_common = value.bitmap & candidates;
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() { if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap(); let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@ -210,10 +205,7 @@ where
if key.field_id != self.field_id { if key.field_id != self.field_id {
return Ok(ControlFlow::Break(())); return Ok(ControlFlow::Break(()));
} }
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( let docids_in_common = value.bitmap & candidates;
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() { if !docids_in_common.is_empty() {
let cf = self.iterate( let cf = self.iterate(
&docids_in_common, &docids_in_common,

View File

@ -4,11 +4,9 @@ use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec; use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, Result}; use crate::Result;
/// Find all the document ids for which the given field contains a value contained within /// Find all the document ids for which the given field contains a value contained within
/// the two bounds. /// the two bounds.
@ -18,7 +16,6 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
field_id: u16, field_id: u16,
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
universe: Option<&RoaringBitmap>,
docids: &mut RoaringBitmap, docids: &mut RoaringBitmap,
) -> Result<()> ) -> Result<()>
where where
@ -49,15 +46,13 @@ where
} }
Bound::Unbounded => Bound::Unbounded, Bound::Unbounded => Bound::Unbounded,
}; };
let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>(); let db = db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids };
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(starting_left_bound) = if let Some(starting_left_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
{
let rightmost_bound = let rightmost_bound =
Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded Bound::Included(get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
let group_size = usize::MAX; let group_size = usize::MAX;
f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
Ok(()) Ok(())
@ -69,16 +64,12 @@ where
/// Fetch the document ids that have a facet with a value between the two given bounds /// Fetch the document ids that have a facet with a value between the two given bounds
struct FacetRangeSearch<'t, 'b, 'bitmap> { struct FacetRangeSearch<'t, 'b, 'bitmap> {
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
left: Bound<&'b [u8]>, left: Bound<&'b [u8]>,
right: Bound<&'b [u8]>, right: Bound<&'b [u8]>,
/// The subset of documents ids that are useful for this search.
/// Great performance optimizations can be achieved by only fetching values matching this subset.
universe: Option<&'bitmap RoaringBitmap>,
docids: &'bitmap mut RoaringBitmap, docids: &'bitmap mut RoaringBitmap,
} }
impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
let left_key = let left_key =
@ -113,13 +104,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
} }
if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
*self.docids |= match self.universe { *self.docids |= value.bitmap;
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
};
} }
} }
Ok(()) Ok(())
@ -210,13 +195,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
left_condition && right_condition left_condition && right_condition
}; };
if should_take_whole_group { if should_take_whole_group {
*self.docids |= match self.universe { *self.docids |= &previous_value.bitmap;
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
previous_key = next_key; previous_key = next_key;
previous_value = next_value; previous_value = next_value;
continue; continue;
@ -312,13 +291,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
left_condition && right_condition left_condition && right_condition
}; };
if should_take_whole_group { if should_take_whole_group {
*self.docids |= match self.universe { *self.docids |= &previous_value.bitmap;
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
} else { } else {
let level = level - 1; let level = level - 1;
let starting_left_bound = previous_key.left_bound; let starting_left_bound = previous_key.left_bound;
@ -392,7 +365,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -412,7 +384,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -447,7 +418,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -469,7 +439,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -505,7 +474,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -531,7 +499,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -570,7 +537,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -590,7 +556,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -606,7 +571,6 @@ mod tests {
0, 0,
&Bound::Unbounded, &Bound::Unbounded,
&Bound::Unbounded, &Bound::Unbounded,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -622,7 +586,6 @@ mod tests {
1, 1,
&Bound::Unbounded, &Bound::Unbounded,
&Bound::Unbounded, &Bound::Unbounded,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -658,7 +621,6 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -672,7 +634,6 @@ mod tests {
1, 1,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();

View File

@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>(
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);

View File

@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>(
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap(); let last_bound = get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap();
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
Ok(itertools::Either::Left(DescendingFacetSort { Ok(itertools::Either::Left(DescendingFacetSort {

View File

@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included};
use either::Either; use either::Either;
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token}; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token};
use roaring::{MultiOps, RoaringBitmap}; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::facet_range_search; use super::facet_range_search;
@ -224,14 +224,14 @@ impl<'a> Filter<'a> {
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let filterable_fields = index.filterable_fields(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?;
self.inner_evaluate(rtxn, index, &filterable_fields, None)
self.inner_evaluate(rtxn, index, &filterable_fields)
} }
fn evaluate_operator( fn evaluate_operator(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
field_id: FieldId, field_id: FieldId,
universe: Option<&RoaringBitmap>,
operator: &Condition<'a>, operator: &Condition<'a>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids; let numbers_db = index.facet_id_f64_docids;
@ -291,22 +291,14 @@ impl<'a> Filter<'a> {
} }
Condition::NotEqual(val) => { Condition::NotEqual(val) => {
let operator = Condition::Equal(val.clone()); let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
let all_ids = index.documents_ids(rtxn)?; let all_ids = index.documents_ids(rtxn)?;
return Ok(all_ids - docids); return Ok(all_ids - docids);
} }
}; };
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels( Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?;
rtxn,
numbers_db,
field_id,
left,
right,
universe,
&mut output,
)?;
Ok(output) Ok(output)
} }
@ -318,7 +310,6 @@ impl<'a> Filter<'a> {
field_id: FieldId, field_id: FieldId,
left: Bound<f64>, left: Bound<f64>,
right: Bound<f64>, right: Bound<f64>,
universe: Option<&RoaringBitmap>,
output: &mut RoaringBitmap, output: &mut RoaringBitmap,
) -> Result<()> { ) -> Result<()> {
match (left, right) { match (left, right) {
@ -330,7 +321,7 @@ impl<'a> Filter<'a> {
(_, _) => (), (_, _) => (),
} }
facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>( facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>(
rtxn, db, field_id, &left, &right, universe, output, rtxn, db, field_id, &left, &right, output,
)?; )?;
Ok(()) Ok(())
@ -341,37 +332,31 @@ impl<'a> Filter<'a> {
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
filterable_fields: &HashSet<String>, filterable_fields: &HashSet<String>,
universe: Option<&RoaringBitmap>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
if universe.map_or(false, |u| u.is_empty()) {
return Ok(RoaringBitmap::new());
}
match &self.condition { match &self.condition {
FilterCondition::Not(f) => { FilterCondition::Not(f) => {
let all_ids = index.documents_ids(rtxn)?;
let selected = Self::inner_evaluate( let selected = Self::inner_evaluate(
&(f.as_ref().clone()).into(), &(f.as_ref().clone()).into(),
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
match universe { Ok(all_ids - selected)
Some(universe) => Ok(universe - selected),
None => {
let all_ids = index.documents_ids(rtxn)?;
Ok(all_ids - selected)
}
}
} }
FilterCondition::In { fid, els } => { FilterCondition::In { fid, els } => {
if crate::is_faceted(fid.value(), filterable_fields) { if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?; let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) { if let Some(fid) = field_ids_map.id(fid.value()) {
els.iter() let mut bitmap = RoaringBitmap::new();
.map(|el| Condition::Equal(el.clone()))
.map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) for el in els {
.union() let op = Condition::Equal(el.clone());
let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?;
bitmap |= el_bitmap;
}
Ok(bitmap)
} else { } else {
Ok(RoaringBitmap::new()) Ok(RoaringBitmap::new())
} }
@ -386,7 +371,7 @@ impl<'a> Filter<'a> {
if crate::is_faceted(fid.value(), filterable_fields) { if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?; let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) { if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, fid, universe, op) Self::evaluate_operator(rtxn, index, fid, op)
} else { } else {
Ok(RoaringBitmap::new()) Ok(RoaringBitmap::new())
} }
@ -397,11 +382,14 @@ impl<'a> Filter<'a> {
}))? }))?
} }
} }
FilterCondition::Or(subfilters) => subfilters FilterCondition::Or(subfilters) => {
.iter() let mut bitmap = RoaringBitmap::new();
.cloned() for f in subfilters {
.map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) bitmap |=
.union(), Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?;
}
Ok(bitmap)
}
FilterCondition::And(subfilters) => { FilterCondition::And(subfilters) => {
let mut subfilters_iter = subfilters.iter(); let mut subfilters_iter = subfilters.iter();
if let Some(first_subfilter) = subfilters_iter.next() { if let Some(first_subfilter) = subfilters_iter.next() {
@ -410,21 +398,16 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
for f in subfilters_iter { for f in subfilters_iter {
if bitmap.is_empty() { if bitmap.is_empty() {
return Ok(bitmap); return Ok(bitmap);
} }
// TODO We are doing the intersections two times,
// it could be more efficient
// Can't I just replace this `&=` by an `=`?
bitmap &= Self::inner_evaluate( bitmap &= Self::inner_evaluate(
&(f.clone()).into(), &(f.clone()).into(),
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
Some(&bitmap),
)?; )?;
} }
Ok(bitmap) Ok(bitmap)
@ -524,7 +507,6 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
let geo_lng_token = Token::new( let geo_lng_token = Token::new(
@ -557,7 +539,6 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
let condition_right = FilterCondition::Condition { let condition_right = FilterCondition::Condition {
@ -571,7 +552,6 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
left | right left | right
@ -587,7 +567,6 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)? )?
}; };
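The hunks above thread an optional `universe` bitmap through the recursive filter evaluation so that nested clauses only ever inspect documents that are still candidates. As a minimal sketch of that idea, assuming the clauses have already been resolved to roaring bitmaps (the function below is illustrative, not Meilisearch's real signature):

use roaring::RoaringBitmap;

// Evaluate an AND of pre-resolved clauses, restricted to an optional universe.
fn evaluate_and(clauses: &[RoaringBitmap], universe: Option<&RoaringBitmap>) -> RoaringBitmap {
    let mut acc = match clauses.first() {
        Some(first) => match universe {
            Some(u) => first & u, // start from the caller's candidate set
            None => first.clone(),
        },
        None => return RoaringBitmap::new(),
    };
    for clause in &clauses[1..] {
        if acc.is_empty() {
            break; // no document can match anymore, stop early
        }
        acc &= clause; // the accumulator can only shrink
    }
    acc
}

Passing the shrinking accumulator down as the universe of the next sub-filter is also what lets a NOT clause subtract from the current candidates instead of from all documents.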

View File

@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter}; pub use self::filter::{BadGeoError, Filter};
pub use self::search::{FacetValueHit, SearchForFacetValues}; pub use self::search::{FacetValueHit, SearchForFacetValues};
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
use crate::heed_codec::BytesRefCodec; use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result}; use crate::{Index, Result};
@ -54,9 +54,9 @@ pub fn facet_max_value<'t>(
} }
/// Get the first facet value in the facet database /// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( pub(crate) fn get_first_facet_value<'t, BoundCodec>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>> ) -> heed::Result<Option<BoundCodec::DItem>>
where where
@ -78,9 +78,9 @@ where
} }
/// Get the last facet value in the facet database /// Get the last facet value in the facet database
pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( pub(crate) fn get_last_facet_value<'t, BoundCodec>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>> ) -> heed::Result<Option<BoundCodec::DItem>>
where where
@ -102,9 +102,9 @@ where
} }
/// Get the height of the highest level in the facet database /// Get the height of the highest level in the facet database
pub(crate) fn get_highest_level<'t, DC>( pub(crate) fn get_highest_level<'t>(
txn: &'t RoTxn<'t>, txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> heed::Result<u8> { ) -> heed::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes(); let field_id_prefix = &field_id.to_be_bytes();

View File

@ -159,7 +159,6 @@ impl<'a> Search<'a> {
offset: 0, offset: 0,
limit: self.limit + self.offset, limit: self.limit + self.offset,
sort_criteria: self.sort_criteria.clone(), sort_criteria: self.sort_criteria.clone(),
distinct: self.distinct.clone(),
searchable_attributes: self.searchable_attributes, searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy, geo_strategy: self.geo_strategy,
terms_matching_strategy: self.terms_matching_strategy, terms_matching_strategy: self.terms_matching_strategy,
@ -170,7 +169,6 @@ impl<'a> Search<'a> {
index: self.index, index: self.index,
semantic: self.semantic.clone(), semantic: self.semantic.clone(),
time_budget: self.time_budget.clone(), time_budget: self.time_budget.clone(),
ranking_score_threshold: self.ranking_score_threshold,
}; };
let semantic = search.semantic.take(); let semantic = search.semantic.take();

View File

@ -11,8 +11,8 @@ use self::new::{execute_vector_search, PartialSearchResult};
use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder; use crate::vector::Embedder;
use crate::{ use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result,
Result, SearchContext, TimeBudget, UserError, SearchContext, TimeBudget,
}; };
// Building these factories is not free. // Building these factories is not free.
@ -40,7 +40,6 @@ pub struct Search<'a> {
offset: usize, offset: usize,
limit: usize, limit: usize,
sort_criteria: Option<Vec<AscDesc>>, sort_criteria: Option<Vec<AscDesc>>,
distinct: Option<String>,
searchable_attributes: Option<&'a [String]>, searchable_attributes: Option<&'a [String]>,
geo_strategy: new::GeoSortStrategy, geo_strategy: new::GeoSortStrategy,
terms_matching_strategy: TermsMatchingStrategy, terms_matching_strategy: TermsMatchingStrategy,
@ -51,7 +50,6 @@ pub struct Search<'a> {
index: &'a Index, index: &'a Index,
semantic: Option<SemanticSearch>, semantic: Option<SemanticSearch>,
time_budget: TimeBudget, time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
} }
impl<'a> Search<'a> { impl<'a> Search<'a> {
@ -62,7 +60,6 @@ impl<'a> Search<'a> {
offset: 0, offset: 0,
limit: 20, limit: 20,
sort_criteria: None, sort_criteria: None,
distinct: None,
searchable_attributes: None, searchable_attributes: None,
geo_strategy: new::GeoSortStrategy::default(), geo_strategy: new::GeoSortStrategy::default(),
terms_matching_strategy: TermsMatchingStrategy::default(), terms_matching_strategy: TermsMatchingStrategy::default(),
@ -73,7 +70,6 @@ impl<'a> Search<'a> {
index, index,
semantic: None, semantic: None,
time_budget: TimeBudget::max(), time_budget: TimeBudget::max(),
ranking_score_threshold: None,
} }
} }
@ -107,11 +103,6 @@ impl<'a> Search<'a> {
self self
} }
pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> {
self.distinct = Some(distinct);
self
}
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> { pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
self.searchable_attributes = Some(searchable); self.searchable_attributes = Some(searchable);
self self
@ -155,11 +146,6 @@ impl<'a> Search<'a> {
self self
} }
pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> {
self.ranking_score_threshold = Some(ranking_score_threshold);
self
}
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> { pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
if has_vector_search { if has_vector_search {
let ctx = SearchContext::new(self.index, self.rtxn)?; let ctx = SearchContext::new(self.index, self.rtxn)?;
@ -176,19 +162,6 @@ impl<'a> Search<'a> {
ctx.attributes_to_search_on(searchable_attributes)?; ctx.attributes_to_search_on(searchable_attributes)?;
} }
if let Some(distinct) = &self.distinct {
let filterable_fields = ctx.index.filterable_fields(ctx.txn)?;
if !crate::is_faceted(distinct, &filterable_fields) {
let (valid_fields, hidden_fields) =
ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?;
return Err(Error::UserError(UserError::InvalidDistinctAttribute {
field: distinct.clone(),
valid_fields,
hidden_fields,
}));
}
}
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?; let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
let PartialSearchResult { let PartialSearchResult {
located_query_terms, located_query_terms,
@ -205,14 +178,12 @@ impl<'a> Search<'a> {
self.scoring_strategy, self.scoring_strategy,
universe, universe,
&self.sort_criteria, &self.sort_criteria,
&self.distinct,
self.geo_strategy, self.geo_strategy,
self.offset, self.offset,
self.limit, self.limit,
embedder_name, embedder_name,
embedder, embedder,
self.time_budget.clone(), self.time_budget.clone(),
self.ranking_score_threshold,
)? )?
} }
_ => execute_search( _ => execute_search(
@ -223,7 +194,6 @@ impl<'a> Search<'a> {
self.exhaustive_number_hits, self.exhaustive_number_hits,
universe, universe,
&self.sort_criteria, &self.sort_criteria,
&self.distinct,
self.geo_strategy, self.geo_strategy,
self.offset, self.offset,
self.limit, self.limit,
@ -231,7 +201,6 @@ impl<'a> Search<'a> {
&mut DefaultSearchLogger, &mut DefaultSearchLogger,
&mut DefaultSearchLogger, &mut DefaultSearchLogger,
self.time_budget.clone(), self.time_budget.clone(),
self.ranking_score_threshold,
)?, )?,
}; };
@ -260,7 +229,6 @@ impl fmt::Debug for Search<'_> {
offset, offset,
limit, limit,
sort_criteria, sort_criteria,
distinct,
searchable_attributes, searchable_attributes,
geo_strategy: _, geo_strategy: _,
terms_matching_strategy, terms_matching_strategy,
@ -271,7 +239,6 @@ impl fmt::Debug for Search<'_> {
index: _, index: _,
semantic, semantic,
time_budget, time_budget,
ranking_score_threshold,
} = self; } = self;
f.debug_struct("Search") f.debug_struct("Search")
.field("query", query) .field("query", query)
@ -280,7 +247,6 @@ impl fmt::Debug for Search<'_> {
.field("offset", offset) .field("offset", offset)
.field("limit", limit) .field("limit", limit)
.field("sort_criteria", sort_criteria) .field("sort_criteria", sort_criteria)
.field("distinct", distinct)
.field("searchable_attributes", searchable_attributes) .field("searchable_attributes", searchable_attributes)
.field("terms_matching_strategy", terms_matching_strategy) .field("terms_matching_strategy", terms_matching_strategy)
.field("scoring_strategy", scoring_strategy) .field("scoring_strategy", scoring_strategy)
@ -291,7 +257,6 @@ impl fmt::Debug for Search<'_> {
&semantic.as_ref().map(|semantic| &semantic.embedder_name), &semantic.as_ref().map(|semantic| &semantic.embedder_name),
) )
.field("time_budget", time_budget) .field("time_budget", time_budget)
.field("ranking_score_threshold", ranking_score_threshold)
.finish() .finish()
} }
} }
@ -312,8 +277,6 @@ pub enum TermsMatchingStrategy {
Last, Last,
// all words are mandatory // all words are mandatory
All, All,
// remove more frequent word first
Frequency,
} }
impl Default for TermsMatchingStrategy { impl Default for TermsMatchingStrategy {
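A hedged usage sketch of the two setters this file gains, `distinct` and `ranking_score_threshold`; the surrounding setup (an open read transaction `rtxn` and an `index` handle) is assumed and error handling is elided:

// Hypothetical call site.
let mut search = Search::new(&rtxn, &index);
search.query("kung fu panda");
search.distinct("letter".to_string());   // distinct attribute chosen per request
search.ranking_score_threshold(0.8);     // hits scoring below 0.8 are dropped
let SearchResult { documents_ids, .. } = search.execute()?;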

View File

@ -22,25 +22,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>, mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
query: &Q, query: &Q,
distinct: Option<&str>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
from: usize, from: usize,
length: usize, length: usize,
scoring_strategy: ScoringStrategy, scoring_strategy: ScoringStrategy,
logger: &mut dyn SearchLogger<Q>, logger: &mut dyn SearchLogger<Q>,
time_budget: TimeBudget, time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
) -> Result<BucketSortOutput> { ) -> Result<BucketSortOutput> {
logger.initial_query(query); logger.initial_query(query);
logger.ranking_rules(&ranking_rules); logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe); logger.initial_universe(universe);
let distinct_field = match distinct { let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
let distinct_fid = if let Some(field) = distinct_field {
ctx.index.fields_ids_map(ctx.txn)?.id(field) ctx.index.fields_ids_map(ctx.txn)?.id(field)
} else { } else {
None None
@ -171,19 +164,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
loop { loop {
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
ranking_rule_scores.push(ScoreDetails::Skipped); ranking_rule_scores.push(ScoreDetails::Skipped);
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
maybe_add_to_results!(bucket); maybe_add_to_results!(bucket);
ranking_rule_scores.pop(); ranking_rule_scores.pop();
if cur_ranking_rule_index == 0 { if cur_ranking_rule_index == 0 {
@ -239,18 +220,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
debug_assert!( debug_assert!(
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
); );
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
if cur_ranking_rule_index == ranking_rules_len - 1 if cur_ranking_rule_index == ranking_rules_len - 1
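The pruning shown in this hunk compares the global score accumulated so far against the caller's threshold and backtracks when it falls short. A small, hypothetical helper illustrates the test; how the per-rule scores are combined into a global score is assumed here to be a plain product of values in [0, 1]:

// Keep a bucket only if its running global score still clears the threshold.
fn keep_bucket(ranking_rule_scores: &[f64], ranking_score_threshold: Option<f64>) -> bool {
    let global_score: f64 = ranking_rule_scores.iter().product();
    ranking_score_threshold.map_or(true, |threshold| global_score >= threshold)
}

Under that assumption, applying further ranking rules can only lower the global score, which is why a failing bucket can also be removed from the overall candidate set.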

View File

@ -164,21 +164,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
} }
costs costs
} }
TermsMatchingStrategy::Frequency => {
let removal_order =
query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
let mut forbidden_nodes =
SmallBitmap::for_interned_values_in(&query_graph.nodes);
let mut costs = query_graph.nodes.map(|_| None);
// FIXME: this works because only words uses termsmatchingstrategy at the moment.
for ns in removal_order {
for n in ns.iter() {
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
}
forbidden_nodes.union(&ns);
}
costs
}
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None), TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
} }
} else { } else {

View File

@ -22,7 +22,7 @@ pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize }, RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> }, ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> }, ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> }, ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
@ -123,9 +123,12 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
&mut self, &mut self,
ranking_rule_idx: usize, ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<QueryGraph>, _ranking_rule: &dyn RankingRule<QueryGraph>,
_universe: &RoaringBitmap, universe: &RoaringBitmap,
) { ) {
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx }); self.events.push(SearchEvents::RankingRuleEndIteration {
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.pop(); self.location.pop();
} }
fn add_to_results(&mut self, docids: &[u32]) { fn add_to_results(&mut self, docids: &[u32]) {
@ -323,7 +326,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?; self.write_skip_bucket(bucket_len)?;
} }
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => { SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?; self.write_end_iteration()?;
} }

View File

@ -516,7 +516,6 @@ mod tests {
false, false,
universe, universe,
&None, &None,
&None,
crate::search::new::GeoSortStrategy::default(), crate::search::new::GeoSortStrategy::default(),
0, 0,
100, 100,
@ -524,7 +523,6 @@ mod tests {
&mut crate::DefaultSearchLogger, &mut crate::DefaultSearchLogger,
&mut crate::DefaultSearchLogger, &mut crate::DefaultSearchLogger,
TimeBudget::max(), TimeBudget::max(),
None,
) )
.unwrap(); .unwrap();

View File

@ -197,11 +197,6 @@ fn resolve_maximally_reduced_query_graph(
.iter() .iter()
.flat_map(|x| x.iter()) .flat_map(|x| x.iter())
.collect(), .collect(),
TermsMatchingStrategy::Frequency => query_graph
.removal_order_for_terms_matching_strategy_frequency(ctx)?
.iter()
.flat_map(|x| x.iter())
.collect(),
TermsMatchingStrategy::All => vec![], TermsMatchingStrategy::All => vec![],
}; };
graph.remove_nodes_keep_edges(&nodes_to_remove); graph.remove_nodes_keep_edges(&nodes_to_remove);
@ -548,7 +543,6 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
Ok(()) Ok(())
} }
#[tracing::instrument(level = "trace", skip_all, target = "search")]
pub fn filtered_universe( pub fn filtered_universe(
index: &Index, index: &Index,
txn: &RoTxn<'_>, txn: &RoTxn<'_>,
@ -568,14 +562,12 @@ pub fn execute_vector_search(
scoring_strategy: ScoringStrategy, scoring_strategy: ScoringStrategy,
universe: RoaringBitmap, universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>, sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy, geo_strategy: geo_sort::Strategy,
from: usize, from: usize,
length: usize, length: usize,
embedder_name: &str, embedder_name: &str,
embedder: &Embedder, embedder: &Embedder,
time_budget: TimeBudget, time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
) -> Result<PartialSearchResult> { ) -> Result<PartialSearchResult> {
check_sort_criteria(ctx, sort_criteria.as_ref())?; check_sort_criteria(ctx, sort_criteria.as_ref())?;
@ -599,14 +591,12 @@ pub fn execute_vector_search(
ctx, ctx,
ranking_rules, ranking_rules,
&PlaceholderQuery, &PlaceholderQuery,
distinct.as_deref(),
&universe, &universe,
from, from,
length, length,
scoring_strategy, scoring_strategy,
placeholder_search_logger, placeholder_search_logger,
time_budget, time_budget,
ranking_score_threshold,
)?; )?;
Ok(PartialSearchResult { Ok(PartialSearchResult {
@ -629,7 +619,6 @@ pub fn execute_search(
exhaustive_number_hits: bool, exhaustive_number_hits: bool,
mut universe: RoaringBitmap, mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>, sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy, geo_strategy: geo_sort::Strategy,
from: usize, from: usize,
length: usize, length: usize,
@ -637,7 +626,6 @@ pub fn execute_search(
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>, placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
query_graph_logger: &mut dyn SearchLogger<QueryGraph>, query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
time_budget: TimeBudget, time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
) -> Result<PartialSearchResult> { ) -> Result<PartialSearchResult> {
check_sort_criteria(ctx, sort_criteria.as_ref())?; check_sort_criteria(ctx, sort_criteria.as_ref())?;
@ -720,14 +708,12 @@ pub fn execute_search(
ctx, ctx,
ranking_rules, ranking_rules,
&graph, &graph,
distinct.as_deref(),
&universe, &universe,
from, from,
length, length,
scoring_strategy, scoring_strategy,
query_graph_logger, query_graph_logger,
time_budget, time_budget,
ranking_score_threshold,
)? )?
} else { } else {
let ranking_rules = let ranking_rules =
@ -736,14 +722,12 @@ pub fn execute_search(
ctx, ctx,
ranking_rules, ranking_rules,
&PlaceholderQuery, &PlaceholderQuery,
distinct.as_deref(),
&universe, &universe,
from, from,
length, length,
scoring_strategy, scoring_strategy,
placeholder_search_logger, placeholder_search_logger,
time_budget, time_budget,
ranking_score_threshold,
)? )?
}; };
@ -753,12 +737,7 @@ pub fn execute_search(
// The candidates is the universe unless the exhaustive number of hits // The candidates is the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set. // is requested and a distinct attribute is set.
if exhaustive_number_hits { if exhaustive_number_hits {
let distinct_field = match distinct.as_deref() { if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
if let Some(f) = distinct_field {
if let Some(distinct_fid) = fields_ids_map.id(f) { if let Some(distinct_fid) = fields_ids_map.id(f) {
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
} }
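When the exhaustive hit count is requested together with a distinct attribute, the candidate set has to be deduplicated on that attribute, as the block above does via `apply_distinct_rule`. A simplified, hypothetical stand-in for that rule:

use std::collections::HashSet;

// Keep only the first document seen for each distinct value.
fn remaining_after_distinct(candidates: &[(u32, String)]) -> Vec<u32> {
    let mut seen = HashSet::new();
    candidates
        .iter()
        .filter(|(_, value)| seen.insert(value.clone()))
        .map(|(docid, _)| *docid)
        .collect()
}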

View File

@ -1,9 +1,8 @@
use std::cmp::{Ordering, Reverse}; use std::cmp::Ordering;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::hash::{Hash, Hasher}; use std::hash::{Hash, Hasher};
use fxhash::{FxHashMap, FxHasher}; use fxhash::{FxHashMap, FxHasher};
use roaring::RoaringBitmap;
use super::interner::{FixedSizeInterner, Interned}; use super::interner::{FixedSizeInterner, Interned};
use super::query_term::{ use super::query_term::{
@ -12,7 +11,6 @@ use super::query_term::{
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::SearchContext; use super::SearchContext;
use crate::search::new::interner::Interner; use crate::search::new::interner::Interner;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::Result; use crate::Result;
/// A node of the [`QueryGraph`]. /// A node of the [`QueryGraph`].
@ -292,49 +290,6 @@ impl QueryGraph {
} }
} }
pub fn removal_order_for_terms_matching_strategy_frequency(
&self,
ctx: &mut SearchContext,
) -> Result<Vec<SmallBitmap<QueryNode>>> {
// lookup frequency for each term
let mut term_with_frequency: Vec<(u8, u64)> = {
let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
for (_, node) in self.nodes.iter() {
match &node.data {
QueryNodeData::Term(t) => {
let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?;
for id in t.term_ids.clone() {
term_docids
.entry(id)
.and_modify(|curr| *curr |= &docids)
.or_insert_with(|| docids.clone());
}
}
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
}
}
term_docids
.into_iter()
.map(|(idx, docids)| match docids.len() {
0 => (idx, u64::max_value()),
frequency => (idx, frequency),
})
.collect()
};
term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
let mut term_weight = BTreeMap::new();
let mut weight: u16 = 1;
let mut peekable = term_with_frequency.into_iter().peekable();
while let Some((idx, frequency)) = peekable.next() {
term_weight.insert(idx, weight);
if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
weight += 1;
}
}
let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
}
pub fn removal_order_for_terms_matching_strategy_last( pub fn removal_order_for_terms_matching_strategy_last(
&self, &self,
ctx: &SearchContext, ctx: &SearchContext,
@ -360,19 +315,10 @@ impl QueryGraph {
if first_term_idx >= last_term_idx { if first_term_idx >= last_term_idx {
return vec![]; return vec![];
} }
let cost_of_term_idx = |term_idx: u8| { let cost_of_term_idx = |term_idx: u8| {
let rank = 1 + last_term_idx - term_idx; let rank = 1 + last_term_idx - term_idx;
rank as u16 rank as u16
}; };
self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
}
pub fn removal_order_for_terms_matching_strategy(
&self,
ctx: &SearchContext,
order: impl Fn(u8) -> u16,
) -> Vec<SmallBitmap<QueryNode>> {
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new(); let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
let mut at_least_one_mandatory_term = false; let mut at_least_one_mandatory_term = false;
for (node_id, node) in self.nodes.iter() { for (node_id, node) in self.nodes.iter() {
@ -383,7 +329,7 @@ impl QueryGraph {
} }
let mut cost = 0; let mut cost = 0;
for id in t.term_ids.clone() { for id in t.term_ids.clone() {
cost = std::cmp::max(cost, order(id)); cost = std::cmp::max(cost, cost_of_term_idx(id));
} }
nodes_to_remove nodes_to_remove
.entry(cost) .entry(cost)
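The `removal_order_for_terms_matching_strategy_frequency` variant shown in this hunk ranks query terms by how many documents they appear in, so that the most frequent (least discriminating) words are relaxed first. A self-contained sketch of that weighting, assuming the caller already has each term id paired with its document frequency:

use std::cmp::Reverse;
use std::collections::BTreeMap;

// Assign removal weights: weight 1 goes to the most frequent terms, and the
// weight only increases when the frequency changes, so ties share a bucket.
fn removal_weights(mut term_frequency: Vec<(u8, u64)>) -> BTreeMap<u8, u16> {
    term_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
    let mut weights = BTreeMap::new();
    let mut weight: u16 = 1;
    let mut terms = term_frequency.into_iter().peekable();
    while let Some((term_id, frequency)) = terms.next() {
        weights.insert(term_id, weight);
        if terms.peek().map_or(false, |(_, next)| *next != frequency) {
            weight += 1;
        }
    }
    weights
}

Terms with the lowest weight are removed first, mirroring the behaviour implemented in this hunk.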

View File

@ -205,18 +205,8 @@ fn create_index() -> TempIndex {
index index
} }
fn verify_distinct( fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
index: &Index, let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids);
txn: &RoTxn,
distinct: Option<&str>,
docids: &[u32],
) -> Vec<String> {
let vs = collect_field_values(
index,
txn,
distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(),
docids,
);
let mut unique = HashSet::new(); let mut unique = HashSet::new();
for v in vs.iter() { for v in vs.iter() {
@ -233,49 +223,12 @@ fn verify_distinct(
fn test_distinct_placeholder_no_ranking_rules() { fn test_distinct_placeholder_no_ranking_rules() {
let index = create_index(); let index = create_index();
// Set the letter as filterable and unset the distinct attribute.
index
.update_settings(|s| {
s.set_filterable_fields(hashset! { S("letter") });
s.reset_distinct_field();
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.distinct(S("letter"));
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
"\"B\"",
"\"C\"",
"\"D\"",
"\"E\"",
"\"F\"",
"\"G\"",
"\"H\"",
"\"I\"",
"__does_not_exist__",
"__does_not_exist__",
"__does_not_exist__",
]
"###);
}
#[test]
fn test_distinct_at_search_placeholder_no_ranking_rules() {
let index = create_index();
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
let s = Search::new(&txn, &index); let s = Search::new(&txn, &index);
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"A\"", "\"A\"",
@ -310,7 +263,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"E\"", "\"E\"",
@ -350,7 +303,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"I\"", "\"I\"",
@ -393,7 +346,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"I\"", "\"I\"",
@ -446,7 +399,7 @@ fn test_distinct_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"A\"", "\"A\"",
@ -500,7 +453,7 @@ fn test_distinct_sort_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"I\"", "\"I\"",
@ -596,7 +549,7 @@ fn test_distinct_typo() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###" insta::assert_debug_snapshot!(distinct_values, @r###"
[ [
"\"B\"", "\"B\"",

View File

@ -0,0 +1,244 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]

View File

@ -13,7 +13,7 @@ use std::collections::BTreeSet;
use std::iter::FromIterator; use std::iter::FromIterator;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::{Search, SearchResult, TermsMatchingStrategy}; use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy};
fn create_index() -> TempIndex { fn create_index() -> TempIndex {
let index = TempIndex::new(); let index = TempIndex::new();
@ -66,10 +66,9 @@ fn create_index() -> TempIndex {
} }
#[test] #[test]
#[cfg(not(feature = "swedish-recomposition"))]
fn test_stop_words_not_indexed() { fn test_stop_words_not_indexed() {
let index = create_index(); let index = create_index();
crate::db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
} }
#[test] #[test]

View File

@ -17,7 +17,6 @@ pub struct Similar<'a> {
index: &'a Index, index: &'a Index,
embedder_name: String, embedder_name: String,
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
ranking_score_threshold: Option<f64>,
} }
impl<'a> Similar<'a> { impl<'a> Similar<'a> {
@ -30,17 +29,7 @@ impl<'a> Similar<'a> {
embedder_name: String, embedder_name: String,
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
) -> Self { ) -> Self {
Self { Self { id, filter: None, offset, limit, rtxn, index, embedder_name, embedder }
id,
filter: None,
offset,
limit,
rtxn,
index,
embedder_name,
embedder,
ranking_score_threshold: None,
}
} }
pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self { pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self {
@ -48,18 +37,8 @@ impl<'a> Similar<'a> {
self self
} }
pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Self {
self.ranking_score_threshold = Some(ranking_score_threshold);
self
}
pub fn execute(&self) -> Result<SearchResult> { pub fn execute(&self) -> Result<SearchResult> {
let mut universe = filtered_universe(self.index, self.rtxn, &self.filter)?; let universe = filtered_universe(self.index, self.rtxn, &self.filter)?;
// we never want to receive the docid
universe.remove(self.id);
let universe = universe;
let embedder_index = let embedder_index =
self.index self.index
@ -98,8 +77,6 @@ impl<'a> Similar<'a> {
let mut documents_seen = RoaringBitmap::new(); let mut documents_seen = RoaringBitmap::new();
documents_seen.insert(self.id); documents_seen.insert(self.id);
let mut candidates = universe;
for (docid, distance) in results for (docid, distance) in results
.into_iter() .into_iter()
// skip documents we've already seen & mark that we saw the current document // skip documents we've already seen & mark that we saw the current document
@ -108,6 +85,8 @@ impl<'a> Similar<'a> {
// take **after** filter and skip so that we get exactly limit elements if available // take **after** filter and skip so that we get exactly limit elements if available
.take(self.limit) .take(self.limit)
{ {
documents_ids.push(docid);
let score = 1.0 - distance; let score = 1.0 - distance;
let score = self let score = self
.embedder .embedder
@ -115,28 +94,14 @@ impl<'a> Similar<'a> {
.map(|distribution| distribution.shift(score)) .map(|distribution| distribution.shift(score))
.unwrap_or(score); .unwrap_or(score);
let score_details = let score = ScoreDetails::Vector(score_details::Vector { similarity: Some(score) });
vec![ScoreDetails::Vector(score_details::Vector { similarity: Some(score) })];
let score = ScoreDetails::global_score(score_details.iter()); document_scores.push(vec![score]);
if let Some(ranking_score_threshold) = &self.ranking_score_threshold {
if score < *ranking_score_threshold {
// this document is no longer a candidate
candidates.remove(docid);
// any document after this one is no longer a candidate either, so restrict the set to documents already seen.
candidates &= documents_seen;
break;
}
}
documents_ids.push(docid);
document_scores.push(score_details);
} }
Ok(SearchResult { Ok(SearchResult {
matching_words: Default::default(), matching_words: Default::default(),
candidates, candidates: universe,
documents_ids, documents_ids,
document_scores, document_scores,
degraded: false, degraded: false,
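The threshold handling in this hunk relies on results arriving sorted by decreasing similarity, so the first score under the threshold also disqualifies every later result as a candidate. A hypothetical sketch of that truncation, with plain tuples standing in for the real score details:

// Keep only the leading run of results whose score clears the threshold.
fn truncate_below_threshold(scored: Vec<(u32, f64)>, threshold: Option<f64>) -> Vec<(u32, f64)> {
    match threshold {
        Some(t) => scored.into_iter().take_while(|(_, score)| *score >= t).collect(),
        None => scored,
    }
}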

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -64,13 +64,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// Remove all user-provided bits from the configs
let mut configs = self.index.embedding_configs(self.wtxn)?;
for config in configs.iter_mut() {
config.user_provided.clear();
}
self.index.put_embedding_configs(self.wtxn, configs)?;
// Clear the other databases. // Clear the other databases.
external_documents_ids.clear(self.wtxn)?; external_documents_ids.clear(self.wtxn)?;
word_docids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?;

View File

@ -40,26 +40,11 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
operation: DelAddOperation, operation: DelAddOperation,
buffer: &mut Vec<u8>, buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
}
/// Akin to the [into_del_add_obkv] function but lets you
/// conditionally define the `DelAdd` variant based on the obkv key.
pub fn into_del_add_obkv_conditional_operation<K, F>(
reader: obkv::KvReader<K>,
buffer: &mut Vec<u8>,
operation: F,
) -> std::io::Result<()>
where
K: obkv::Key + PartialOrd,
F: Fn(K) -> DelAddOperation,
{
let mut writer = obkv::KvWriter::new(buffer); let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new(); let mut value_buffer = Vec::new();
for (key, value) in reader.iter() { for (key, value) in reader.iter() {
value_buffer.clear(); value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let operation = operation(key);
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
value_writer.insert(DelAdd::Deletion, value)?; value_writer.insert(DelAdd::Deletion, value)?;
} }
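`into_del_add_obkv_conditional_operation` lets a per-key closure decide whether each field value is written as a deletion, an addition, or both. A simplified sketch with ordinary maps standing in for the obkv reader and writer (names and types here are illustrative only):

use std::collections::BTreeMap;

#[derive(Clone, Copy)]
enum Op { Deletion, Addition, DeletionAndAddition }

// Split one document's fields into a deletion side and an addition side,
// according to a per-key decision function.
fn split_by_operation(
    fields: &BTreeMap<u16, String>,
    operation: impl Fn(u16) -> Op,
) -> (BTreeMap<u16, String>, BTreeMap<u16, String>) {
    let (mut deletions, mut additions) = (BTreeMap::new(), BTreeMap::new());
    for (&key, value) in fields {
        match operation(key) {
            Op::Deletion => { deletions.insert(key, value.clone()); }
            Op::Addition => { additions.insert(key, value.clone()); }
            Op::DeletionAndAddition => {
                deletions.insert(key, value.clone());
                additions.insert(key, value.clone());
            }
        }
    }
    (deletions, additions)
}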

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet}; use std::collections::BTreeMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader}; use std::io::{self, BufReader};
@ -9,7 +9,7 @@ use std::result::Result as StdResult;
use bytemuck::bytes_of; use bytemuck::bytes_of;
use grenad::Sorter; use grenad::Sorter;
use heed::BytesEncode; use heed::BytesEncode;
use itertools::{merge_join_by, EitherOrBoth}; use itertools::EitherOrBoth;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{from_slice, Value}; use serde_json::{from_slice, Value};
@ -18,7 +18,7 @@ use FilterableValues::{Empty, Null, Values};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError; use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
@ -45,6 +45,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> { ) -> Result<ExtractedFacetValues> {
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
@ -75,181 +76,143 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let mut numbers_key_buffer = Vec::new(); let mut numbers_key_buffer = Vec::new();
let mut strings_key_buffer = Vec::new(); let mut strings_key_buffer = Vec::new();
let old_faceted_fids: BTreeSet<_> = let mut cursor = obkv_documents.into_cursor()?;
settings_diff.old.faceted_fields_ids.iter().copied().collect(); while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let new_faceted_fids: BTreeSet<_> = let obkv = obkv::KvReader::new(value);
settings_diff.new.faceted_fields_ids.iter().copied().collect();
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { for (field_id, field_bytes) in obkv.iter() {
let mut cursor = obkv_documents.into_cursor()?; let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
while let Some((docid_bytes, value)) = cursor.move_on_next()? { let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
let obkv = obkv::KvReader::new(value); if delete_faceted || add_faceted {
let get_document_json_value = move |field_id, side| { numbers_key_buffer.clear();
obkv.get(field_id) strings_key_buffer.clear();
.map(KvReaderDelAdd::new)
.and_then(|kv| kv.get(side))
.map(from_slice)
.transpose()
.map_err(InternalError::SerdeJson)
};
// iterate over the faceted fields instead of over the whole document.
for eob in
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
old.cmp(new)
})
{
let (field_id, del_value, add_value) = match eob {
EitherOrBoth::Left(&field_id) => {
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
// deletion only // Set key to the field_id
(field_id, del_value, None) // Note: this encoding is consistent with FieldIdCodec
} numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
EitherOrBoth::Right(&field_id) => { strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
// addition only let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
(field_id, None, add_value) let document = DocumentId::from_be_bytes(document);
}
EitherOrBoth::Both(&field_id, _) => {
// during settings update, recompute the changing settings only.
if settings_diff.settings_update_only {
continue;
}
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; // For the other extraction tasks, prefix the key with the field_id and the document_id
let add_value = get_document_json_value(field_id, DelAdd::Addition)?; numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);
(field_id, del_value, add_value) let del_add_obkv = obkv::KvReader::new(field_bytes);
} let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
{
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
};
let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
}; };
if del_value.is_some() || add_value.is_some() { // We insert the document id on the Del and the Add side if the field exists.
numbers_key_buffer.clear(); let (ref mut del_exists, ref mut add_exists) =
strings_key_buffer.clear(); facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();
// Set key to the field_id if del_value.is_some() {
// Note: this encoding is consistent with FieldIdCodec del_exists.insert(document);
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); }
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); if add_value.is_some() {
add_exists.insert(document);
}
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let geo_support =
let document = DocumentId::from_be_bytes(document); geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, geo_support));
// For the other extraction tasks, prefix the key with the field_id and the document_id // Those closures are just here to simplify things a bit.
numbers_key_buffer.extend_from_slice(docid_bytes); let mut insert_numbers_diff = |del_numbers, add_numbers| {
strings_key_buffer.extend_from_slice(docid_bytes); insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
// We insert the document id on the Del and the Add side if the field exists. match (del_filterable_values, add_filterable_values) {
let (ref mut del_exists, ref mut add_exists) = (None, None) => (),
facet_exists_docids.entry(field_id).or_default(); (Some(del_filterable_values), None) => match del_filterable_values {
let (ref mut del_is_null, ref mut add_is_null) = Null => {
facet_is_null_docids.entry(field_id).or_default(); del_is_null.insert(document);
let (ref mut del_is_empty, ref mut add_is_empty) = }
facet_is_empty_docids.entry(field_id).or_default(); Empty => {
del_is_empty.insert(document);
if del_value.is_some() { }
del_exists.insert(document); Values { numbers, strings } => {
} insert_numbers_diff(numbers, vec![])?;
if add_value.is_some() { insert_strings_diff(strings, vec![])?;
add_exists.insert(document); }
} },
(None, Some(add_filterable_values)) => match add_filterable_values {
let del_geo_support = settings_diff Null => {
.old add_is_null.insert(document);
.geo_fields_ids }
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); Empty => {
let add_geo_support = settings_diff add_is_empty.insert(document);
.new }
.geo_fields_ids Values { numbers, strings } => {
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); insert_numbers_diff(vec![], numbers)?;
let del_filterable_values = insert_strings_diff(vec![], strings)?;
del_value.map(|value| extract_facet_values(&value, del_geo_support)); }
let add_filterable_values = },
add_value.map(|value| extract_facet_values(&value, add_geo_support)); (Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
// Those closures are just here to simplify things a bit. (Null, Null) | (Empty, Empty) => (),
let mut insert_numbers_diff = |del_numbers, add_numbers| { (Null, Empty) => {
insert_numbers_diff( del_is_null.insert(document);
&mut fid_docid_facet_numbers_sorter, add_is_empty.insert(document);
&mut numbers_key_buffer, }
del_numbers, (Empty, Null) => {
add_numbers, del_is_empty.insert(document);
) add_is_null.insert(document);
}; }
let mut insert_strings_diff = |del_strings, add_strings| { (Null, Values { numbers, strings }) => {
insert_strings_diff( insert_numbers_diff(vec![], numbers)?;
&mut fid_docid_facet_strings_sorter, insert_strings_diff(vec![], strings)?;
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
match (del_filterable_values, add_filterable_values) {
(None, None) => (),
(Some(del_filterable_values), None) => match del_filterable_values {
Null => {
del_is_null.insert(document); del_is_null.insert(document);
} }
Empty => { (Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document); del_is_empty.insert(document);
} }
Values { numbers, strings } => { (Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?; insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?; insert_strings_diff(strings, vec![])?;
} }
}, (Values { numbers, strings }, Empty) => {
(None, Some(add_filterable_values)) => match add_filterable_values {
Null => {
add_is_null.insert(document);
}
Empty => {
add_is_empty.insert(document); add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
} }
Values { numbers, strings } => { (
insert_numbers_diff(vec![], numbers)?; Values { numbers: del_numbers, strings: del_strings },
insert_strings_diff(vec![], strings)?; Values { numbers: add_numbers, strings: add_strings },
} ) => {
}, insert_numbers_diff(del_numbers, add_numbers)?;
(Some(del_filterable_values), Some(add_filterable_values)) => { insert_strings_diff(del_strings, add_strings)?;
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document);
}
(Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document);
}
(Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(Values { numbers, strings }, Empty) => {
add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(
Values { numbers: del_numbers, strings: del_strings },
Values { numbers: add_numbers, strings: add_strings },
) => {
insert_numbers_diff(del_numbers, add_numbers)?;
insert_strings_diff(del_strings, add_strings)?;
}
} }
} }
} }

View File

@ -8,7 +8,6 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError; use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value; use crate::update::index_documents::extract_finite_float_from_value;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, InternalError, Result}; use crate::{FieldId, InternalError, Result};
/// Extracts the geographical coordinates contained in each document under the `_geo` field. /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@ -19,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
primary_key_id: FieldId, primary_key_id: FieldId,
settings_diff: &InnerIndexSettingsDiff, (lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
let mut writer = create_writer( let mut writer = create_writer(
indexer.chunk_compression_type, indexer.chunk_compression_type,
@ -39,27 +38,47 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
serde_json::from_slice(document_id).unwrap() serde_json::from_slice(document_id).unwrap()
}; };
// extract old version // first we get the two fields
let del_lat_lng = match (obkv.get(lat_fid), obkv.get(lng_fid)) {
extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?; (Some(lat), Some(lng)) => {
// extract new version let deladd_lat_obkv = KvReaderDelAdd::new(lat);
let add_lat_lng = let deladd_lng_obkv = KvReaderDelAdd::new(lng);
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
if del_lat_lng != add_lat_lng { // then we extract the values
let mut obkv = KvWriterDelAdd::memory(); let del_lat_lng = deladd_lat_obkv
if let Some([lat, lng]) = del_lat_lng { .get(DelAdd::Deletion)
#[allow(clippy::drop_non_drop)] .zip(deladd_lng_obkv.get(DelAdd::Deletion))
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
obkv.insert(DelAdd::Deletion, bytes)?; .transpose()?;
let add_lat_lng = deladd_lat_obkv
.get(DelAdd::Addition)
.zip(deladd_lng_obkv.get(DelAdd::Addition))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
}
} }
if let Some([lat, lng]) = add_lat_lng { (None, Some(_)) => {
#[allow(clippy::drop_non_drop)] return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
} }
let bytes = obkv.into_inner()?; (Some(_), None) => {
writer.insert(docid_bytes, bytes)?; return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => (),
} }
} }
@ -67,37 +86,16 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
} }
/// Extract the finite floats lat and lng from two bytes slices. /// Extract the finite floats lat and lng from two bytes slices.
fn extract_lat_lng( fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
document: &obkv::KvReader<FieldId>, let lat = extract_finite_float_from_value(
settings: &InnerIndexSettings, serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
deladd: DelAdd, )
document_id: impl Fn() -> Value, .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
) -> Result<Option<[f64; 2]>> {
match settings.geo_fields_ids {
Some((lat_fid, lng_fid)) => {
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let (lat, lng) = match (lat, lng) {
(Some(lat), Some(lng)) => (lat, lng),
(Some(_), None) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(None, Some(_)) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => return Ok(None),
};
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
let lng = extract_finite_float_from_value( let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
) )
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok(Some([lat, lng]))
} Ok([lat, lng])
None => Ok(None),
}
} }
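Both versions of this extractor boil down to the same two steps: parse `lat` and `lng` as finite floats, then pack them into a 16-byte native-endian payload keyed by the document id. A hedged, stand-alone sketch of that parsing and packing (the helpers below are illustrative, not the milli API):

```rust
// Sketch only: parse a JSON number into a finite f64 (loosely mirroring
// `extract_finite_float_from_value`), then pack lat/lng into 16 bytes.
fn parse_finite(raw: &[u8]) -> Result<f64, String> {
    let value: serde_json::Value =
        serde_json::from_slice(raw).map_err(|e| e.to_string())?;
    match value.as_f64() {
        Some(f) if f.is_finite() => Ok(f),
        _ => Err(format!("not a finite float: {value}")),
    }
}

fn pack_lat_lng(lat: &[u8], lng: &[u8]) -> Result<[u8; 16], String> {
    let lat = parse_finite(lat)?;
    let lng = parse_finite(lng)?;
    // Same layout as the extractor: lat bytes followed by lng bytes.
    let mut bytes = [0u8; 16];
    bytes[..8].copy_from_slice(&lat.to_ne_bytes());
    bytes[8..].copy_from_slice(&lng.to_ne_bytes());
    Ok(bytes)
}

fn main() {
    let bytes = pack_lat_lng(b"48.8566", b"2.3522").unwrap();
    assert_eq!(bytes.len(), 16);
}
```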

View File

@ -8,19 +8,18 @@ use std::sync::Arc;
use bytemuck::cast_slice; use bytemuck::cast_slice;
use grenad::Writer; use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::index::IndexEmbeddingConfig;
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::settings::{EmbedderAction, ReindexAction};
use crate::vector::Embedder; use crate::vector::Embedder;
use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; use crate::{DocumentId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values. /// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -36,8 +35,6 @@ pub struct ExtractedVectorPoints {
// embedder // embedder
pub embedder_name: String, pub embedder_name: String,
pub embedder: Arc<Embedder>, pub embedder: Arc<Embedder>,
pub add_to_user_provided: RoaringBitmap,
pub remove_from_user_provided: RoaringBitmap,
} }
enum VectorStateDelta { enum VectorStateDelta {
@ -45,7 +42,12 @@ enum VectorStateDelta {
// Remove all vectors, generated or manual, from this document // Remove all vectors, generated or manual, from this document
NowRemoved, NowRemoved,
NowManual(Vec<Vec<f32>>), // Add the manually specified vectors, passed in the other grenad
// Remove any previously generated vectors
// Note: changing the value of the manually specified vector **should not record** this delta
WasGeneratedNowManual(Vec<Vec<f32>>),
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),
// Add the vector computed from the specified prompt // Add the vector computed from the specified prompt
// Remove any previous vector // Remove any previous vector
@ -54,12 +56,14 @@ enum VectorStateDelta {
} }
impl VectorStateDelta { impl VectorStateDelta {
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) { fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) {
match self { match self {
VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NoChange => Default::default(),
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
// We always delete the previous vectors VectorStateDelta::WasGeneratedNowManual(add) => {
VectorStateDelta::NowManual(add) => (true, Default::default(), add), (true, Default::default(), (Default::default(), add))
}
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
} }
} }
@ -70,27 +74,12 @@ struct EmbedderVectorExtractor {
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
prompt: Arc<Prompt>, prompt: Arc<Prompt>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// (docid) -> (prompt) // (docid) -> (prompt)
prompts_writer: Writer<BufWriter<File>>, prompts_writer: Writer<BufWriter<File>>,
// (docid) -> () // (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>, remove_vectors_writer: Writer<BufWriter<File>>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contains a user defined embedding
add_to_user_provided: RoaringBitmap,
action: ExtractionAction,
}
struct DocumentOperation {
// The docids of the documents that contains an auto-generated embedding
remove_from_user_provided: RoaringBitmap,
}
enum ExtractionAction {
SettingsFullReindex,
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
DocumentOperation(DocumentOperation),
} }
/// Extracts the embedding vector contained in each document under the `_vectors` field. /// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -100,7 +89,6 @@ enum ExtractionAction {
pub fn extract_vector_points<R: io::Read + io::Seek>( pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
embedders_configs: &[IndexEmbeddingConfig],
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
) -> Result<Vec<ExtractedVectorPoints>> { ) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors(); let reindex_vectors = settings_diff.reindex_vectors();
@ -109,207 +97,153 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let new_fields_ids_map = &settings_diff.new.fields_ids_map; let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// the vector field id may have changed // the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
let mut extractors = Vec::new(); let mut extractors = Vec::new();
for (embedder_name, (embedder, prompt)) in
settings_diff.new.embedding_configs.clone().into_iter()
{
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); // (docid) -> (prompt)
let old_configs = &settings_diff.old.embedding_configs; let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
if reindex_vectors { // (docid) -> ()
for (name, action) in settings_diff.embedding_config_updates.iter() { let remove_vectors_writer = create_writer(
match action { indexer.chunk_compression_type,
EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted indexer.chunk_compression_level,
EmbedderAction::Reindex(action) => { tempfile::tempfile()?,
let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name) );
else {
tracing::error!(embedder = name, "Requested embedder config not found");
continue;
};
// (docid, _index) -> KvWriterDelAdd -> Vector extractors.push(EmbedderVectorExtractor {
let manual_vectors_writer = create_writer( embedder_name,
indexer.chunk_compression_type, embedder,
indexer.chunk_compression_level, prompt,
tempfile::tempfile()?, manual_vectors_writer,
); prompts_writer,
remove_vectors_writer,
// (docid) -> (prompt) });
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let action = match action {
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
ReindexAction::RegeneratePrompts => {
let Some((_, old_prompt)) = old_configs.get(name) else {
tracing::error!(embedder = name, "Old embedder config not found");
continue;
};
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
}
};
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action,
});
}
}
}
} else {
// document operation
for (embedder_name, (embedder, prompt)) in configs.into_iter() {
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action: ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided: RoaringBitmap::new(),
}),
});
}
} }
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid); // this must always be serialized as (docid, external_docid);
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
let (docid_bytes, external_id_bytes) = let (docid_bytes, external_id_bytes) =
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap(); try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok()); debug_assert!(from_utf8(external_id_bytes).is_ok());
let docid = DocumentId::from_be_bytes(docid_bytes);
let obkv = obkv::KvReader::new(value); let obkv = obkv::KvReader::new(value);
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes.as_slice()); key_buffer.extend_from_slice(docid_bytes);
// since we only need the primary key when we throw an error we create this getter to // since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed // lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let mut parsed_vectors = ParsedVectorsDiff::new( let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
docid, .map_err(|error| error.to_crate_error(document_id().to_string()))?;
embedders_configs,
obkv,
old_vectors_fid,
new_vectors_fid,
)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
for EmbedderVectorExtractor { for EmbedderVectorExtractor {
embedder_name, embedder_name,
embedder: _, embedder: _,
prompt, prompt,
manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided,
action,
} in extractors.iter_mut() } in extractors.iter_mut()
{ {
let (old, new) = parsed_vectors.remove(embedder_name); let delta = match parsed_vectors.remove(embedder_name) {
let delta = match action { (Some(old), Some(new)) => {
ExtractionAction::SettingsFullReindex => match old { // no autogeneration
// A full reindex can be triggered either by: let del_vectors = old.into_array_of_vectors();
// 1. a new embedder let add_vectors = new.into_array_of_vectors();
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
VectorState::Inline(vectors) => {
if !vectors.must_regenerate() {
add_to_user_provided.insert(docid);
}
match vectors.into_array_of_vectors() { if add_vectors.len() > usize::from(u8::MAX) {
Some(add_vectors) => { return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
if add_vectors.len() > usize::from(u8::MAX) { document_id().to_string(),
return Err(crate::Error::UserError( add_vectors.len(),
crate::UserError::TooManyVectors( )));
document_id().to_string(),
add_vectors.len(),
),
));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
}
} }
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
VectorState::Manual => VectorStateDelta::NoChange, VectorStateDelta::ManualDelta(del_vectors, add_vectors)
// generated vectors must be regenerated }
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, (Some(_old), None) => {
}, // Do we keep this document?
// prompt regeneration is only triggered for existing embedders let document_is_kept = obkv
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { .iter()
if old.must_regenerate() { .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
regenerate_if_prompt_changed( .any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv, obkv,
(old_prompt, prompt), DelAdd::Addition,
(old_fields_ids_map, new_fields_ids_map), new_fields_ids_map,
)? )?)
} else { } else {
// we can simply ignore user provided vectors as they are not regenerated and are VectorStateDelta::NowRemoved
// already in the DB since this is an existing embedder }
VectorStateDelta::NoChange }
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt)
// TODO: this filter works because we erase the vec database when an embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default()
});
let new_prompt =
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
} }
} }
ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided,
}) => extract_vector_document_diff(
docid,
obkv,
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(old_fields_ids_map, new_fields_ids_map),
document_id,
)?,
}; };
// and we finally push the unique vectors into the writer // and we finally push the unique vectors into the writer
push_vectors_diff( push_vectors_diff(
remove_vectors_writer, remove_vectors_writer,
@ -317,6 +251,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
&mut key_buffer, &mut key_buffer,
delta, delta,
reindex_vectors,
)?; )?;
} }
} }
@ -327,185 +262,43 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
embedder_name, embedder_name,
embedder, embedder,
prompt: _, prompt: _,
manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
action,
manual_vectors_writer,
add_to_user_provided,
} in extractors } in extractors
{ {
let remove_from_user_provided =
if let ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided,
}) = action
{
remove_from_user_provided
} else {
Default::default()
};
results.push(ExtractedVectorPoints { results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?, manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?, prompts: writer_into_reader(prompts_writer)?,
embedder, embedder,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
}) })
} }
Ok(results) Ok(results)
} }
fn extract_vector_document_diff( /// Computes the diff between both Del and Add numbers and
docid: DocumentId, /// only inserts the parts that differ in the sorter.
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
(old, new): (VectorState, VectorState),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
document_id: impl Fn() -> Value,
) -> Result<VectorStateDelta> {
match (old.must_regenerate(), new.must_regenerate()) {
(true, true) | (false, false) => {}
(true, false) => {
add_to_user_provided.insert(docid);
}
(false, true) => {
remove_from_user_provided.insert(docid);
}
}
let delta = match (old, new) {
// regardless of the previous state, if a document now contains inline _vectors, they must
// be extracted manually
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
},
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
// document changed
(VectorState::Generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt).map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
// the previous version of the document.
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
// Generated -> Generated is handled above, so not possible
// As a result, this code is unreachable
(_not_generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
// version of the document.
// however the Rust type system cannot know that.
(_manual, VectorState::Manual) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// if the new version of documents has the vectors in the DB,
// then they are user-provided and nothing possibly changed
VectorStateDelta::NoChange
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
};
Ok(delta)
}
fn regenerate_if_prompt_changed(
obkv: obkv::KvReader<'_, FieldId>,
(old_prompt, new_prompt): (&Prompt, &Prompt),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
) -> Result<VectorStateDelta> {
let old_prompt =
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if new_prompt == old_prompt {
return Ok(VectorStateDelta::NoChange);
}
Ok(VectorStateDelta::NowGenerated(new_prompt))
}
fn regenerate_prompt(
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
new_fields_ids_map: &FieldsIdsMap,
) -> Result<VectorStateDelta> {
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
Ok(VectorStateDelta::NowGenerated(prompt))
}
/// We cannot compute the diff between both Del and Add vectors.
/// We'll push every vector and compute the difference later in TypedChunk.
fn push_vectors_diff( fn push_vectors_diff(
remove_vectors_writer: &mut Writer<BufWriter<File>>, remove_vectors_writer: &mut Writer<BufWriter<File>>,
prompts_writer: &mut Writer<BufWriter<File>>, prompts_writer: &mut Writer<BufWriter<File>>,
manual_vectors_writer: &mut Writer<BufWriter<File>>, manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>, key_buffer: &mut Vec<u8>,
delta: VectorStateDelta, delta: VectorStateDelta,
reindex_vectors: bool,
) -> Result<()> { ) -> Result<()> {
let (must_remove, prompt, mut add_vectors) = delta.into_values(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove { if must_remove
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !reindex_vectors
{
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?; remove_vectors_writer.insert(&key_buffer, [])?;
} }
@ -515,22 +308,44 @@ fn push_vectors_diff(
} }
// We sort and dedup the vectors // We sort and dedup the vectors
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
let merged_vectors_iter =
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer // insert vectors into the writer
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by extending the unique index to it. // Generate the key by extending the unique index to it.
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
let index = u16::try_from(i).unwrap(); let index = u16::try_from(i).unwrap();
key_buffer.extend_from_slice(&index.to_be_bytes()); key_buffer.extend_from_slice(&index.to_be_bytes());
// We insert only the Add part of the Obkv to inform match eob {
// that we only want to remove all those vectors. EitherOrBoth::Both(_, _) => (), // no need to touch anything
let mut obkv = KvWriterDelAdd::memory(); EitherOrBoth::Left(vector) => {
obkv.insert(DelAdd::Addition, cast_slice(&vector))?; // TODO: the below condition works because we erase the vec database when an embedding setting changes.
let bytes = obkv.into_inner()?; // When vector pipeline will be optimized, this should be removed.
manual_vectors_writer.insert(&key_buffer, bytes)?; if !reindex_vectors {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
} }
Ok(()) Ok(())
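One side of this hunk sorts, dedups and merges the deleted and added vectors with `itertools::merge_join_by`, so unchanged vectors are never rewritten. A minimal sketch of that pattern, assuming `itertools` and `ordered_float` as dependencies:

```rust
use itertools::{merge_join_by, EitherOrBoth};
use ordered_float::OrderedFloat;

// Compare two embedding vectors with a total order, as the extractor does.
fn compare_vectors(a: &[f32], b: &[f32]) -> std::cmp::Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}

// Sketch only: keep the left-only entries (to delete) and right-only
// entries (to add); vectors present on both sides need no write at all.
fn vector_diff(
    mut del: Vec<Vec<f32>>,
    mut add: Vec<Vec<f32>>,
) -> (Vec<Vec<f32>>, Vec<Vec<f32>>) {
    del.sort_unstable_by(|a, b| compare_vectors(a, b));
    add.sort_unstable_by(|a, b| compare_vectors(a, b));
    del.dedup_by(|a, b| compare_vectors(a, b).is_eq());
    add.dedup_by(|a, b| compare_vectors(a, b).is_eq());

    let mut only_del = Vec::new();
    let mut only_add = Vec::new();
    for eob in merge_join_by(del, add, |d, a| compare_vectors(d, a)) {
        match eob {
            EitherOrBoth::Both(_, _) => (), // unchanged vector: nothing to write
            EitherOrBoth::Left(v) => only_del.push(v),
            EitherOrBoth::Right(v) => only_add.push(v),
        }
    }
    (only_del, only_add)
}

fn main() {
    let (del, add) = vector_diff(vec![vec![1.0], vec![2.0]], vec![vec![2.0], vec![3.0]]);
    assert_eq!(del, vec![vec![1.0]]);
    assert_eq!(add, vec![vec![3.0]]);
}
```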

View File

@ -26,8 +26,11 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters, indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
// early return if the data shouldn't be deleted nor created. // early return if the data shouldn't be deleted nor created.
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() { if !any_deletion && !any_addition {
let writer = create_writer( let writer = create_writer(
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
@ -36,10 +39,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
return writer_into_reader(writer); return writer_into_reader(writer);
} }
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
.map(|_| { .map(|_| {
create_sorter( create_sorter(
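Both sides of this hunk compute the same guard: the whole word-pair-proximity pass is skipped when neither the old nor the new `proximityPrecision` is `ByWord`. A toy sketch of that early-return logic, with a simplified enum standing in for the real setting:

```rust
// Sketch only: a simplified stand-in for the real ProximityPrecision setting.
#[derive(PartialEq)]
enum ProximityPrecision {
    ByWord,
    ByAttribute,
}

// Deletions are needed if the old setting produced word pairs, additions if
// the new one does; when neither holds the extractor can return early.
fn needs_word_pair_proximity(old: &ProximityPrecision, new: &ProximityPrecision) -> bool {
    let any_deletion = *old == ProximityPrecision::ByWord;
    let any_addition = *new == ProximityPrecision::ByWord;
    any_deletion || any_addition
}

fn main() {
    assert!(!needs_word_pair_proximity(
        &ProximityPrecision::ByAttribute,
        &ProximityPrecision::ByAttribute,
    ));
    assert!(needs_word_pair_proximity(
        &ProximityPrecision::ByWord,
        &ProximityPrecision::ByAttribute,
    ));
}
```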

View File

@ -11,7 +11,7 @@ mod extract_word_position_docids;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use std::sync::{Arc, OnceLock}; use std::sync::Arc;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
use rayon::prelude::*; use rayon::prelude::*;
@ -30,9 +30,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids; use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel. /// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender. /// Send data in grenad file over provided Sender.
@ -44,7 +43,7 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>, geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<()> { ) -> Result<()> {
@ -57,7 +56,6 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk, original_documents_chunk,
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
embedders_configs.clone(),
settings_diff.clone(), settings_diff.clone(),
) )
}) })
@ -72,6 +70,7 @@ pub(crate) fn data_from_obkv_documents(
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
geo_fields_ids,
settings_diff.clone(), settings_diff.clone(),
max_positions_per_attributes, max_positions_per_attributes,
) )
@ -207,47 +206,33 @@ fn run_extraction_task<FE, FS, M>(
}) })
} }
fn request_threads() -> &'static ThreadPoolNoAbort {
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
REQUEST_THREADS.get_or_init(|| {
ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()
.unwrap()
})
}
/// Extract chunked data and send it into lmdb_writer_sx sender: /// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents /// - documents
fn send_original_documents_data( fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
) -> Result<()> { ) -> Result<()> {
let original_documents_chunk = let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
// no point in indexing vectors without embedders // no point in indexing vectors without embedders
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
if index_vectors { if index_vectors {
let settings_diff = settings_diff.clone(); let settings_diff = settings_diff.clone();
let embedders_configs = embedders_configs.clone();
let original_documents_chunk = original_documents_chunk.clone(); let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || { rayon::spawn(move || {
match extract_vector_points( match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
original_documents_chunk.clone(),
indexer,
&embedders_configs,
&settings_diff,
) {
Ok(extracted_vectors) => { Ok(extracted_vectors) => {
for ExtractedVectorPoints { for ExtractedVectorPoints {
manual_vectors, manual_vectors,
@ -255,15 +240,13 @@ fn send_original_documents_data(
prompts, prompts,
embedder_name, embedder_name,
embedder, embedder,
add_to_user_provided,
remove_from_user_provided,
} in extracted_vectors } in extracted_vectors
{ {
let embeddings = match extract_embeddings( let embeddings = match extract_embeddings(
prompts, prompts,
indexer, indexer,
embedder.clone(), embedder.clone(),
request_threads(), &request_threads,
) { ) {
Ok(results) => Some(results), Ok(results) => Some(results),
Err(error) => { Err(error) => {
@ -281,8 +264,6 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(), expected_dimension: embedder.dimensions(),
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
})); }));
} }
} }
@ -312,6 +293,7 @@ fn send_and_extract_flattened_documents_data(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<( ) -> Result<(
@ -321,13 +303,12 @@ fn send_and_extract_flattened_documents_data(
let flattened_documents_chunk = let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
if settings_diff.run_geo_indexing() { if let Some(geo_fields_ids) = geo_fields_ids {
let documents_chunk_cloned = flattened_documents_chunk.clone(); let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let settings_diff = settings_diff.clone();
rayon::spawn(move || { rayon::spawn(move || {
let result = let result =
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff); extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
let _ = match result { let _ = match result {
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
Err(error) => lmdb_writer_sx_cloned.send(Err(error)), Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@ -366,6 +347,7 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
&settings_diff, &settings_diff,
geo_fields_ids,
)?; )?;
// send fid_docid_facet_numbers_chunk to DB writer // send fid_docid_facet_numbers_chunk to DB writer
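One side of the hunk above defines a `request_threads()` helper that lazily initialises a single process-wide embedding-request pool behind a `OnceLock`, instead of building a new pool on every call. A sketch of that pattern using a plain `rayon::ThreadPool` (the real code wraps it in `ThreadPoolNoAbort`):

```rust
use std::sync::OnceLock;

// Sketch only: return the same lazily-built pool on every call.
fn request_threads() -> &'static rayon::ThreadPool {
    static POOL: OnceLock<rayon::ThreadPool> = OnceLock::new();
    POOL.get_or_init(|| {
        rayon::ThreadPoolBuilder::new()
            .num_threads(4) // stand-in for REQUEST_PARALLELISM
            .thread_name(|index| format!("embedding-request-{index}"))
            .build()
            .expect("failed to build the request thread pool")
    })
}

fn main() {
    // Every call returns the same pool; it is built at most once.
    let sum: i32 = request_threads().install(|| (1..=10).sum());
    assert_eq!(sum, 55);
}
```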

View File

@ -286,7 +286,6 @@ where
settings_diff.new.recompute_searchables(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff); let settings_diff = Arc::new(settings_diff);
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
let backup_pool; let backup_pool;
let pool = match self.indexer_config.thread_pool { let pool = match self.indexer_config.thread_pool {
@ -316,6 +315,28 @@ where
// get the primary key field id // get the primary key field id
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
// self.index.fields_ids_map($a)? ==>> field_id_map
let geo_fields_ids = match field_id_map.id("_geo") {
Some(gfid) => {
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = field_id_map
.insert("_geo.lat")
.zip(field_id_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let pool_params = GrenadParameters { let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type, chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level, chunk_compression_level: self.indexer_config.chunk_compression_level,
@ -370,7 +391,6 @@ where
// Run extraction pipeline in parallel. // Run extraction pipeline in parallel.
pool.install(|| { pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || { rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks"); let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
let _enter = child_span.enter(); let _enter = child_span.enter();
@ -400,8 +420,8 @@ where
pool_params, pool_params,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
embedders_configs.clone(), geo_fields_ids,
settings_diff_cloned, settings_diff.clone(),
max_positions_per_attributes, max_positions_per_attributes,
) )
}); });
@ -428,7 +448,7 @@ where
Err(status) => { Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() { if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) = let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
if !docids.is_empty() { if !docids.is_empty() {
final_documents_ids |= docids; final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len(); let documents_seen_count = final_documents_ids.len();
@ -503,8 +523,6 @@ where
embeddings, embeddings,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
} => { } => {
dimension.insert(embedder_name.clone(), expected_dimension); dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints { TypedChunk::VectorPoints {
@ -513,8 +531,6 @@ where
expected_dimension, expected_dimension,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
} }
} }
otherwise => otherwise, otherwise => otherwise,
@ -547,11 +563,10 @@ where
pool.install(|| { pool.install(|| {
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) { for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
let writer = arroy::Writer::new(vector_arroy, k, dimension); let writer = arroy::Writer::new(vector_arroy, k, dimension);
if writer.need_build(wtxn)? { if writer.is_empty(wtxn)? {
writer.build(wtxn, &mut rng, None)?;
} else if writer.is_empty(wtxn)? {
break; break;
} }
writer.build(wtxn, &mut rng, None)?;
} }
Result::Ok(()) Result::Ok(())
}) })
@ -788,7 +803,6 @@ mod tests {
use super::*; use super::*;
use crate::documents::documents_batch_reader_from_objects; use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::search::TermsMatchingStrategy; use crate::search::TermsMatchingStrategy;
use crate::update::Setting; use crate::update::Setting;
use crate::{db_snap, Filter, Search}; use crate::{db_snap, Filter, Search};
@ -2624,12 +2638,10 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = let (embedder_name, embedder) = embedding_configs.pop().unwrap();
embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
let embedder = let embedder =
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
assert_eq!("manual", embedder_name);
let res = index let res = index
.search(&rtxn) .search(&rtxn)
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))
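One side of this hunk resolves `geo_fields_ids` up front: geo extraction only happens when `_geo` exists and is sortable or filterable, in which case `_geo.lat` and `_geo.lng` are inserted into the fields-ids map. A self-contained sketch of that decision, with a toy map standing in for the real `FieldsIdsMap`:

```rust
use std::collections::HashMap;

// Sketch only: a toy fields-ids map, not the milli implementation.
#[derive(Default)]
struct FieldsIdsMap {
    ids: HashMap<String, u16>,
}

impl FieldsIdsMap {
    fn id(&self, name: &str) -> Option<u16> {
        self.ids.get(name).copied()
    }
    // Returns None when no more ids are available, like the real map.
    fn insert(&mut self, name: &str) -> Option<u16> {
        let next = u16::try_from(self.ids.len()).ok()?;
        Some(*self.ids.entry(name.to_string()).or_insert(next))
    }
}

// Only enable geo extraction when `_geo` exists and is faceted
// (sortable or filterable); otherwise return None and skip it.
fn geo_fields_ids(
    map: &mut FieldsIdsMap,
    is_sortable: bool,
    is_filterable: bool,
) -> Option<(u16, u16)> {
    map.id("_geo")?;
    if is_sortable || is_filterable {
        map.insert("_geo.lat").zip(map.insert("_geo.lng"))
    } else {
        None
    }
}

fn main() {
    let mut map = FieldsIdsMap::default();
    map.insert("_geo");
    assert!(geo_fields_ids(&mut map, false, false).is_none());
    assert_eq!(geo_fields_ids(&mut map, true, false), Some((1, 2)));
}
```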

View File

@ -1,7 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry; use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry; use std::collections::hash_map::Entry as HEntry;
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
@ -20,15 +20,10 @@ use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::update::del_add::{ use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation,
KvReaderDelAdd,
};
use crate::update::index_documents::GrenadParameters; use crate::update::index_documents::GrenadParameters;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
use crate::{ use crate::{
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
}; };
@ -53,6 +48,7 @@ pub struct Transform<'a, 'i> {
fields_ids_map: FieldsIdsMap, fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig, indexer_settings: &'a IndexerConfig,
pub autogenerate_docids: bool,
pub index_documents_method: IndexDocumentsMethod, pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds, available_documents_ids: AvailableDocumentsIds,
@ -106,7 +102,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index: &'i Index, index: &'i Index,
indexer_settings: &'a IndexerConfig, indexer_settings: &'a IndexerConfig,
index_documents_method: IndexDocumentsMethod, index_documents_method: IndexDocumentsMethod,
_autogenerate_docids: bool, autogenerate_docids: bool,
) -> Result<Self> { ) -> Result<Self> {
// We must choose the appropriate merge function for when two or more documents // We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch. // with the same user id must be merged or fully replaced in the same batch.
@ -140,6 +136,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index, index,
fields_ids_map: index.fields_ids_map(wtxn)?, fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings, indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
original_sorter, original_sorter,
flattened_sorter, flattened_sorter,
@ -808,15 +805,13 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut new_inner_settings = old_inner_settings.clone(); let mut new_inner_settings = old_inner_settings.clone();
new_inner_settings.fields_ids_map = fields_ids_map; new_inner_settings.fields_ids_map = fields_ids_map;
let embedding_config_updates = Default::default(); let settings_diff = InnerIndexSettingsDiff {
let settings_update_only = false; old: old_inner_settings,
let settings_diff = InnerIndexSettingsDiff::new( new: new_inner_settings,
old_inner_settings,
new_inner_settings,
primary_key_id, primary_key_id,
embedding_config_updates, embedding_configs_updated: false,
settings_update_only, settings_update_only: false,
); };
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
@ -835,19 +830,24 @@ impl<'a, 'i> Transform<'a, 'i> {
/// Rebind the field_ids of the provided document to their values /// Rebind the field_ids of the provided document to their values
/// based on the field_ids_maps difference between the old and the new settings, /// based on the field_ids_maps difference between the old and the new settings,
/// then fill the provided buffers with delta documents using KvWritterDelAdd. /// then fill the provided buffers with delta documents using KvWritterDelAdd.
#[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
fn rebind_existing_document( fn rebind_existing_document(
old_obkv: KvReader<FieldId>, old_obkv: KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
modified_faceted_fields: &HashSet<String>, modified_faceted_fields: &HashSet<String>,
mut injected_vectors: serde_json::Map<String, serde_json::Value>,
old_vectors_fid: Option<FieldId>,
original_obkv_buffer: Option<&mut Vec<u8>>, original_obkv_buffer: Option<&mut Vec<u8>>,
flattened_obkv_buffer: Option<&mut Vec<u8>>, flattened_obkv_buffer: Option<&mut Vec<u8>>,
) -> Result<()> { ) -> Result<()> {
// Always keep the primary key. // Always keep the primary key.
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
// If only the `searchableAttributes` has been changed, keep only the searchable fields.
let must_reindex_searchables = settings_diff.reindex_searchable();
let necessary_searchable_field = |id: FieldId| -> bool {
must_reindex_searchables
&& (settings_diff.old.searchable_fields_ids.contains(&id)
|| settings_diff.new.searchable_fields_ids.contains(&id))
};
// If only a faceted field has been added, keep only this field. // If only a faceted field has been added, keep only this field.
let must_reindex_facets = settings_diff.reindex_facets(); let must_reindex_facets = settings_diff.reindex_facets();
let necessary_faceted_field = |id: FieldId| -> bool { let necessary_faceted_field = |id: FieldId| -> bool {
@ -862,68 +862,16 @@ impl<'a, 'i> Transform<'a, 'i> {
// we need the fields for the prompt/templating. // we need the fields for the prompt/templating.
let reindex_vectors = settings_diff.reindex_vectors(); let reindex_vectors = settings_diff.reindex_vectors();
// The operations that we must perform on the different fields.
let mut operations = HashMap::new();
let mut error_seen = false;
let mut obkv_writer = KvWriter::<_, FieldId>::memory(); let mut obkv_writer = KvWriter::<_, FieldId>::memory();
'write_fid: for (id, val) in old_obkv.iter() { for (id, val) in old_obkv.iter() {
if !injected_vectors.is_empty() { if is_primary_key(id)
'inject_vectors: { || necessary_searchable_field(id)
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; || necessary_faceted_field(id)
|| reindex_vectors
if id < vectors_fid { {
break 'inject_vectors;
}
let mut existing_vectors = if id == vectors_fid {
let existing_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
serde_json::Error,
> = serde_json::from_slice(val);
match existing_vectors {
Ok(existing_vectors) => existing_vectors,
Err(error) => {
if !error_seen {
tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map");
error_seen = true;
}
Default::default()
}
}
} else {
Default::default()
};
existing_vectors.append(&mut injected_vectors);
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer
.insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?;
if id == vectors_fid {
continue 'write_fid;
}
}
}
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
operations.insert(id, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(id, val)?;
} else if let Some(operation) = settings_diff.reindex_searchable_id(id) {
operations.insert(id, operation);
obkv_writer.insert(id, val)?; obkv_writer.insert(id, val)?;
} }
} }
if !injected_vectors.is_empty() {
'inject_vectors: {
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?;
}
}
let data = obkv_writer.into_inner()?; let data = obkv_writer.into_inner()?;
let obkv = KvReader::<FieldId>::new(&data); let obkv = KvReader::<FieldId>::new(&data);
@ -939,9 +887,11 @@ impl<'a, 'i> Transform<'a, 'i> {
let flattened = flattened.as_deref().map_or(obkv, KvReader::new); let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
flattened_obkv_buffer.clear(); flattened_obkv_buffer.clear();
into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { into_del_add_obkv(
operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition) flattened,
})?; DelAddOperation::DeletionAndAddition,
flattened_obkv_buffer,
)?;
} }
Ok(()) Ok(())
@ -951,11 +901,6 @@ impl<'a, 'i> Transform<'a, 'i> {
/// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
/// ///
// TODO this can be done in parallel by using the rayon `ThreadPool`. // TODO this can be done in parallel by using the rayon `ThreadPool`.
#[tracing::instrument(
level = "trace",
skip(self, wtxn, settings_diff),
target = "indexing::documents"
)]
pub fn prepare_for_documents_reindexing( pub fn prepare_for_documents_reindexing(
self, self,
wtxn: &mut heed::RwTxn<'i>, wtxn: &mut heed::RwTxn<'i>,
@ -989,35 +934,6 @@ impl<'a, 'i> Transform<'a, 'i> {
None None
}; };
let readers: Result<
BTreeMap<&str, (Vec<arroy::Reader<arroy::distances::Angular>>, &RoaringBitmap)>,
> = settings_diff
.embedding_config_updates
.iter()
.filter_map(|(name, action)| {
if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}) = action
{
let readers: Result<Vec<_>> =
self.index.arroy_readers(wtxn, *embedder_id).collect();
match readers {
Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
Err(error) => Some(Err(error)),
}
} else {
None
}
})
.collect();
let readers = readers?;
let old_vectors_fid = settings_diff
.old
.fields_ids_map
.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
// We initialize the sorter with the user indexing settings. // We initialize the sorter with the user indexing settings.
let mut flattened_sorter = let mut flattened_sorter =
if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
@ -1044,50 +960,10 @@ impl<'a, 'i> Transform<'a, 'i> {
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?; )?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,
> = readers
.iter()
.filter_map(|(name, (readers, user_provided))| {
if !user_provided.contains(docid) {
return None;
}
let mut vectors = Vec::new();
for reader in readers {
let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
break;
};
match vector {
Ok(vector) => vectors.push(vector),
Err(error) => return Some(Err(error)),
}
}
if vectors.is_empty() {
return None;
}
Some(Ok((
name.to_string(),
serde_json::to_value(ExplicitVectors {
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
vectors,
)),
regenerate: false,
})
.unwrap(),
)))
})
.collect();
let injected_vectors = injected_vectors?;
Self::rebind_existing_document( Self::rebind_existing_document(
old_obkv, old_obkv,
&settings_diff, &settings_diff,
&modified_faceted_fields, &modified_faceted_fields,
injected_vectors,
old_vectors_fid,
Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
)?; )?;
@ -1104,23 +980,6 @@ impl<'a, 'i> Transform<'a, 'i> {
} }
} }
let mut writers = Vec::new();
// delete all vectors from the embedders that need removal
for (_, (readers, _)) in readers {
for reader in readers {
let dimensions = reader.dimensions();
let arroy_index = reader.index();
drop(reader);
let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions);
writers.push(writer);
}
}
for writer in writers {
writer.clear(wtxn)?;
}
let grenad_params = GrenadParameters { let grenad_params = GrenadParameters {
chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_type: self.indexer_settings.chunk_compression_type,
chunk_compression_level: self.indexer_settings.chunk_compression_level, chunk_compression_level: self.indexer_settings.chunk_compression_level,
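One side of this hunk reads user-provided embeddings back from arroy and re-injects them into each document under `_vectors.<embedder>` with `regenerate: false` before reindexing. A hedged sketch of the JSON shape being built (names and shapes are illustrative):

```rust
use serde_json::{json, Map, Value};

// Sketch only: re-inject user-provided embeddings into a document under
// `_vectors.<embedder>`, marking them as not to be regenerated.
fn inject_vectors(
    mut document: Map<String, Value>,
    per_embedder: Vec<(String, Vec<Vec<f32>>)>,
) -> Map<String, Value> {
    let mut vectors = Map::new();
    for (embedder, embeddings) in per_embedder {
        if embeddings.is_empty() {
            continue;
        }
        vectors.insert(
            embedder,
            json!({ "embeddings": embeddings, "regenerate": false }),
        );
    }
    if !vectors.is_empty() {
        document.insert("_vectors".to_string(), Value::Object(vectors));
    }
    document
}

fn main() {
    let doc = Map::new();
    let doc = inject_vectors(doc, vec![("manual".into(), vec![vec![0.0, 1.0, 2.0]])]);
    println!("{}", serde_json::to_string_pretty(&Value::Object(doc)).unwrap());
}
```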

View File

@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::{Merger, MergerBuilder};
use heed::types::Bytes;
use heed::{BytesDecode, RwTxn}; use heed::RwTxn;
use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap;
@ -20,17 +20,13 @@ use super::MergeFn;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
Result, SerializationError, U8StrStrCodec,
};
/// This struct accumulates and groups the TypedChunks
@ -91,8 +87,6 @@ pub(crate) enum TypedChunk {
expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String,
add_to_user_provided: RoaringBitmap,
remove_from_user_provided: RoaringBitmap,
},
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
@ -128,10 +122,9 @@ impl TypedChunk {
/// Return new documents seen.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
pub(crate) fn write_typed_chunk_into_index(
wtxn: &mut RwTxn,
index: &Index,
settings_diff: &InnerIndexSettingsDiff,
typed_chunks: Vec<TypedChunk>,
index: &Index,
wtxn: &mut RwTxn,
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunks[0] {
@ -157,11 +150,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> = index let embedders: BTreeSet<_> =
.embedding_configs(wtxn)? index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -187,7 +177,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition);
};
vectors.retain_not_embedded_vectors(&embedders); vectors.retain_user_provided_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() {
// skip writing empty `_vectors` map
@ -495,22 +485,13 @@ pub(crate) fn write_typed_chunk_into_index(
}
let merger = builder.build();
if settings_diff.only_additional_fields.is_some() { write_entries_into_database(
write_proximity_entries_into_database_additional_searchables( merger,
merger, &index.word_pair_proximity_docids,
&index.word_pair_proximity_docids, wtxn,
wtxn, deladd_serialize_add_side,
)?; merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
} else { )?;
write_entries_into_database(
merger,
&index.word_pair_proximity_docids,
wtxn,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
}
is_merged_database = true;
}
TypedChunk::FieldIdDocidFacetNumbers(_) => {
@ -625,8 +606,6 @@ pub(crate) fn write_typed_chunk_into_index(
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut add_to_user_provided = RoaringBitmap::new();
let mut remove_from_user_provided = RoaringBitmap::new();
let mut params = None;
for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints {
@ -635,8 +614,6 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings,
expected_dimension,
embedder_name,
add_to_user_provided: aud,
remove_from_user_provided: rud,
} = typed_chunk
else {
unreachable!();
@ -649,23 +626,11 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?);
}
add_to_user_provided |= aud;
remove_from_user_provided |= rud;
}
// typed_chunks always has at least 1 chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(wtxn)?;
let index_embedder_config = embedding_configs
.iter_mut()
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
.unwrap();
index_embedder_config.user_provided -= remove_from_user_provided;
index_embedder_config.user_provided |= add_to_user_provided;
index.put_embedding_configs(wtxn, embedding_configs)?;
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;
@ -865,51 +830,3 @@ where
}
Ok(())
}
/// Akin to the `write_entries_into_database` function but specialized
/// for the case where we only index additional searchable fields.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
fn write_proximity_entries_into_database_additional_searchables<R>(
merger: Merger<R, MergeFn>,
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
wtxn: &mut RwTxn,
) -> Result<()>
where
R: io::Read + io::Seek,
{
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
if valid_lmdb_key(key) {
let (proximity_to_insert, word1, word2) =
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
Some(value) => {
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
}
None => continue,
};
let mut data_to_remove = RoaringBitmap::new();
for prox in 1..(MAX_DISTANCE as u8) {
let key = (prox, word1, word2);
let database_value = database.get(wtxn, &key)?.unwrap_or_default();
let value = if prox == proximity_to_insert {
// Proximity that should be changed.
// Union values and remove lower proximity data
(&database_value | &data_to_insert) - &data_to_remove
} else {
// Remove lower proximity data
&database_value - &data_to_remove
};
// add the current data in data_to_remove for the next proximities
data_to_remove |= &value;
if database_value != value {
database.put(wtxn, &key, &value)?;
}
}
}
}
Ok(())
}
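For context, a self-contained sketch of the proximity-override loop above, using an in-memory BTreeMap in place of the LMDB word_pair_proximity_docids database; the word pair, docids and MAX_DISTANCE value are made up for illustration.

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

fn main() {
    const MAX_DISTANCE: u8 = 8; // assumed value for the sketch
    // (proximity, word1, word2) -> docids, standing in for word_pair_proximity_docids
    let mut database: BTreeMap<(u8, &str, &str), RoaringBitmap> = BTreeMap::new();
    database.insert((4, "new", "york"), [1u32, 2].into_iter().collect());

    // docid 2 is now observed at proximity 2 for the same word pair
    let proximity_to_insert = 2u8;
    let data_to_insert: RoaringBitmap = [2u32].into_iter().collect();

    let mut data_to_remove = RoaringBitmap::new();
    for prox in 1..MAX_DISTANCE {
        let key = (prox, "new", "york");
        let database_value = database.get(&key).cloned().unwrap_or_default();
        let value = if prox == proximity_to_insert {
            // union the inserted data and drop anything already kept at a lower proximity
            (&database_value | &data_to_insert) - &data_to_remove
        } else {
            // higher proximities lose the docids already kept at lower ones
            &database_value - &data_to_remove
        };
        data_to_remove |= &value;
        if database_value != value {
            database.insert(key, value);
        }
    }

    assert!(database[&(2, "new", "york")].contains(2)); // moved down to proximity 2
    assert!(!database[&(4, "new", "york")].contains(2)); // removed from proximity 4
    assert!(database[&(4, "new", "york")].contains(1)); // untouched docid stays
}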

View File

@ -6,27 +6,19 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
use super::del_add::DelAddOperation;
use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{ use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
use crate::vector::settings::{
check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
WriteBackToDocuments,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, Result};
@ -497,7 +489,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn,
&names,
&fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
&fields_ids_map,
)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@ -927,177 +918,92 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(changed)
}
fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> { fn update_embedding_configs(&mut self) -> Result<bool> {
match std::mem::take(&mut self.embedder_settings) { let update = match std::mem::take(&mut self.embedder_settings) {
Setting::Set(configs) => self.update_embedding_configs_set(configs), Setting::Set(configs) => {
Setting::Reset => { let mut changed = false;
// all vectors should be written back to documents
let old_configs = self.index.embedding_configs(self.wtxn)?; let old_configs = self.index.embedding_configs(self.wtxn)?;
let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
let mut new_configs = BTreeMap::new();
for joined in old_configs
.into_iter() .into_iter()
.map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
let embedder_id = {
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( match joined {
crate::InternalError::DatabaseMissingEntry { // updated config
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, EitherOrBoth::Both((name, mut old), (_, new)) => {
key: None, changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
}, if changed {
)?; tracing::debug!(embedder = name, "need reindex");
Ok(( } else {
name, tracing::debug!(embedder = name, "skip reindex");
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { }
embedder_id, let new = validate_embedding_settings(old, &name)?;
user_provided, new_configs.insert(name, new);
}), }
)) // unchanged config
EitherOrBoth::Left((name, setting)) => {
new_configs.insert(name, setting);
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(
&mut setting,
);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
changed = true;
new_configs.insert(name, setting);
}
}
}
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
.into_iter()
.filter_map(|(name, setting)| match setting {
Setting::Set(value) => Some((name, value.into())),
Setting::Reset => None,
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
}) })
.collect(); .collect();
let remove_all = remove_all?;
self.index.embedder_category_id.clear(self.wtxn)?; self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
embedder_name,
&index
.try_into()
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
)?;
}
if new_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, new_configs)?;
}
changed
}
Setting::Reset => {
self.index.delete_embedding_configs(self.wtxn)?; self.index.delete_embedding_configs(self.wtxn)?;
Ok(remove_all) true
} }
Setting::NotSet => Ok(Default::default()), Setting::NotSet => false,
} };
}
fn update_embedding_configs_set( // if any changes force a reindexing
&mut self, // clear the vector database.
configs: BTreeMap<String, Setting<EmbeddingSettings>>, if update {
) -> Result<BTreeMap<String, EmbedderAction>> { self.index.vector_arroy.clear(self.wtxn)?;
use crate::vector::settings::SettingsDiff; }
let old_configs = self.index.embedding_configs(self.wtxn)?; Ok(update)
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
(name, (config.into(), user_provided))
})
.collect();
let mut updated_configs = BTreeMap::new();
let mut embedder_actions = BTreeMap::new();
for joined in old_configs
.into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
let settings_diff = SettingsDiff::from_settings(old, new);
match settings_diff {
SettingsDiff::Remove => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"removing embedder"
);
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
// free id immediately
self.index.embedder_category_id.delete(self.wtxn, &name)?;
embedder_actions.insert(
name,
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}),
);
}
SettingsDiff::Reindex { action, updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
?action,
"reindex embedder"
);
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"update without reindex embedder"
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
}
}
// unchanged config
EitherOrBoth::Left((name, (setting, user_provided))) => {
tracing::debug!(embedder = name, "unchanged embedder");
updated_configs.insert(name, (Setting::Set(setting), user_provided));
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
tracing::debug!(embedder = name, "new embedder");
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
embedder_actions
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
updated_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
for res in self.index.embedder_category_id.iter(self.wtxn)? {
let (_name, id) = res?;
free_indices[id as usize] = false;
}
let mut free_indices = free_indices.iter_mut().enumerate();
let mut find_free_index =
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
for (name, action) in embedder_actions.iter() {
match action {
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
/* cannot be a new embedder, so has to have an id already */
}
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
let id = find_free_index()
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
}
}
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
}
}
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
})
.collect();
if updated_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
}
Ok(embedder_actions)
} }
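As an illustration of the merge_join_by pattern used above to pair old and new embedder configurations by name (plain strings stand in for EmbeddingSettings here, and the embedder names and sources are made up):

use std::collections::BTreeMap;
use itertools::{EitherOrBoth, Itertools};

fn main() {
    let old: BTreeMap<&str, &str> = BTreeMap::from([("default", "openAi"), ("image", "rest")]);
    let new: BTreeMap<&str, &str> =
        BTreeMap::from([("default", "openAi"), ("text", "huggingFace")]);

    // both maps iterate in key order, so a single merge join pairs them by embedder name
    for joined in old
        .into_iter()
        .merge_join_by(new.into_iter(), |(left, _), (right, _)| left.cmp(right))
    {
        match joined {
            // present on both sides: candidate for an update / reindex decision
            EitherOrBoth::Both((name, old_v), (_, new_v)) => {
                println!("update {name}: {old_v} -> {new_v}")
            }
            // only in the old settings: left unchanged by this update
            EitherOrBoth::Left((name, _)) => println!("unchanged: {name}"),
            // only in the new settings: a brand new embedder
            EitherOrBoth::Right((name, _)) => println!("new embedder: {name}"),
        }
    }
}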
fn update_search_cutoff(&mut self) -> Result<bool> {
@ -1151,8 +1057,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_searchable()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
let embedding_config_updates = self.update_embedding_configs()?;
// TODO: very rough approximation of the needs for reindexing where any change will result in
// a full reindexing.
// What can be done instead:
// 1. Only change the distance on a distance change
// 2. Only change the name -> embedder mapping on a name change
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?;
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
@ -1161,14 +1072,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
.index
.primary_key(self.wtxn)?
.and_then(|name| new_inner_settings.fields_ids_map.id(name));
let settings_update_only = true; let inner_settings_diff = InnerIndexSettingsDiff {
let inner_settings_diff = InnerIndexSettingsDiff::new( old: old_inner_settings,
old_inner_settings, new: new_inner_settings,
new_inner_settings,
primary_key_id, primary_key_id,
embedding_config_updates, embedding_configs_updated,
settings_update_only, settings_update_only: true,
); };
if inner_settings_diff.any_reindexing_needed() {
self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
@ -1182,106 +1092,24 @@ pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>,
pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>, // TODO: compare directly the embedders.
pub(crate) embedding_configs_updated: bool,
pub(crate) settings_update_only: bool,
/// The set of only the additional searchable fields.
/// If any other searchable field has been modified, is set to None.
pub(crate) only_additional_fields: Option<HashSet<String>>,
// Cache the check to see if all the stop_words, allowed_separators, dictionary,
// exact_attributes, proximity_precision are different.
pub(crate) cache_reindex_searchable_without_user_defined: bool,
// Cache the check to see if the user_defined_searchables are different.
pub(crate) cache_user_defined_searchables: bool,
// Cache the check to see if the exact_attributes are different.
pub(crate) cache_exact_attributes: bool,
}
impl InnerIndexSettingsDiff {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")]
pub(crate) fn new(
old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>,
embedding_config_updates: BTreeMap<String, EmbedderAction>,
settings_update_only: bool,
) -> Self {
let only_additional_fields = match (
&old_settings.user_defined_searchable_fields,
&new_settings.user_defined_searchable_fields,
) {
(None, None) | (Some(_), None) | (None, Some(_)) => None, // None means *
(Some(old), Some(new)) => {
let old: HashSet<_> = old.iter().cloned().collect();
let new: HashSet<_> = new.iter().cloned().collect();
if old.difference(&new).next().is_none() {
// if no field has been removed return only the additional ones
Some(&new - &old).filter(|x| !x.is_empty())
} else {
None
}
}
};
let cache_reindex_searchable_without_user_defined = {
old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
!= new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| old_settings.allowed_separators != new_settings.allowed_separators
|| old_settings.dictionary != new_settings.dictionary
|| old_settings.proximity_precision != new_settings.proximity_precision
};
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
!= new_settings.user_defined_searchable_fields;
InnerIndexSettingsDiff {
old: old_settings,
new: new_settings,
primary_key_id,
embedding_config_updates,
settings_update_only,
only_additional_fields,
cache_reindex_searchable_without_user_defined,
cache_user_defined_searchables,
cache_exact_attributes,
}
}
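A standalone sketch of the only_additional_fields rule computed above, using plain string sets (the field names are made up): adding searchable fields keeps the incremental path open, while removing any field, or a `*` wildcard on either side, yields None.

use std::collections::HashSet;

fn only_additional_fields(
    old: Option<&[&str]>,
    new: Option<&[&str]>,
) -> Option<HashSet<String>> {
    match (old, new) {
        // None means `*`: no way to tell what was added
        (None, _) | (_, None) => None,
        (Some(old), Some(new)) => {
            let old: HashSet<String> = old.iter().map(|s| s.to_string()).collect();
            let new: HashSet<String> = new.iter().map(|s| s.to_string()).collect();
            if old.difference(&new).next().is_none() {
                // no field removed: report only the additional ones
                Some(&new - &old).filter(|x| !x.is_empty())
            } else {
                None
            }
        }
    }
}

fn main() {
    assert_eq!(
        only_additional_fields(Some(&["title"]), Some(&["title", "overview"])),
        Some(HashSet::from(["overview".to_string()]))
    );
    // a removed field forces a full searchable reindex instead
    assert_eq!(only_additional_fields(Some(&["title", "overview"]), Some(&["overview"])), None);
}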
pub fn any_reindexing_needed(&self) -> bool {
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
}
pub fn reindex_searchable(&self) -> bool {
self.cache_reindex_searchable_without_user_defined self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| self.cache_exact_attributes != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| self.cache_user_defined_searchables || self.old.allowed_separators != self.new.allowed_separators
} || self.old.dictionary != self.new.dictionary
|| self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
pub fn reindex_proximities(&self) -> bool { || self.old.exact_attributes != self.new.exact_attributes
// if any searchable settings force the reindexing || self.old.proximity_precision != self.new.proximity_precision
(self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables)
// and if any settings needs the proximity database created
&& (self.old.proximity_precision == ProximityPrecision::ByAttribute
|| self.new.proximity_precision == ProximityPrecision::ByAttribute)
}
pub fn reindex_searchable_id(&self, id: FieldId) -> Option<DelAddOperation> {
if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes {
Some(DelAddOperation::DeletionAndAddition)
} else if let Some(only_additional_fields) = &self.only_additional_fields {
let additional_field = self.new.fields_ids_map.name(id).unwrap();
if only_additional_fields.contains(additional_field) {
Some(DelAddOperation::Addition)
} else {
None
}
} else if self.cache_user_defined_searchables {
Some(DelAddOperation::DeletionAndAddition)
} else {
None
}
} }
pub fn reindex_facets(&self) -> bool {
@ -1307,18 +1135,13 @@ impl InnerIndexSettingsDiff {
}
pub fn reindex_vectors(&self) -> bool {
!self.embedding_config_updates.is_empty() self.embedding_configs_updated
}
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
pub fn run_geo_indexing(&self) -> bool {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
pub fn modified_faceted_fields(&self) -> HashSet<String> {
&self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields
}
@ -1338,9 +1161,6 @@ pub(crate) struct InnerIndexSettings {
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
pub non_searchable_fields_ids: Vec<FieldId>,
pub non_faceted_fields_ids: Vec<FieldId>,
}
impl InnerIndexSettings {
@ -1349,13 +1169,13 @@ impl InnerIndexSettings {
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?;
let mut fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let exact_attributes = index.exact_attributes_ids(rtxn)?;
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
@ -1364,28 +1184,6 @@ impl InnerIndexSettings {
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id("_geo") {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
Ok(Self {
stop_words,
@ -1400,9 +1198,6 @@ impl InnerIndexSettings {
proximity_precision,
embedding_configs,
existing_fields,
geo_fields_ids,
non_searchable_fields_ids: vectors_fids.clone(),
non_faceted_fields_ids: vectors_fids.clone(),
})
}
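For reference, a sketch of the `_geo` document shape that the lat/lng field-id registration above looks for when `_geo` is filterable or sortable (the document content is made up):

fn main() {
    let doc = serde_json::json!({
        "id": 1,
        "_geo": { "lat": 48.8566, "lng": 2.3522 }
    });
    // after flattening, `_geo.lat` and `_geo.lng` get their own field ids,
    // which is what the sortable/filterable geo code paths operate on
    println!("{doc}");
}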
@ -1410,10 +1205,9 @@ impl InnerIndexSettings {
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
let new_facets = self
.fields_ids_map
.iter() .names()
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) .map(|field| field.to_string())
.map(|(_fid, field)| field.to_string())
.collect();
index.put_faceted_fields(wtxn, &new_facets)?;
@ -1433,7 +1227,6 @@ impl InnerIndexSettings {
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
&searchable_fields,
&self.non_searchable_fields_ids,
&self.fields_ids_map,
)?;
}
@ -1444,25 +1237,19 @@ impl InnerIndexSettings {
}
}
fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> { fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map( .map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|IndexEmbeddingConfig { let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
name,
config: EmbeddingConfig { embedder_options, prompt },
..
}| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
}, })
)
.collect();
res.map(EmbeddingConfigs::new)
}
@ -1768,7 +1555,7 @@ mod tests {
// When we search for something that is not in
// the searchable fields it must not return any document.
let result = index.search(&rtxn).query("23").execute().unwrap();
assert_eq!(result.documents_ids, Vec::<u32>::new()); assert!(result.documents_ids.is_empty());
// When we search for something that is in the searchable fields
// we must find the appropriate document.

View File

@ -152,10 +152,6 @@ impl EmbeddingConfigs {
&self.0
}
pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>)> {
self.0
}
/// Get the name of the default embedder configuration.
///
/// The default embedder is determined as follows:

View File

@ -1,119 +1,51 @@
use std::collections::{BTreeMap, BTreeSet};
use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
use obkv::KvReader;
use serde_json::{from_slice, Value};
use super::Embedding;
use crate::index::IndexEmbeddingConfig;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{DocumentId, FieldId, InternalError, UserError}; use crate::{FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
#[derive(serde::Serialize, Debug)] #[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(untagged)]
pub enum Vectors {
ImplicitlyUserProvided(VectorOrArrayOfVectors),
Explicit(ExplicitVectors),
}
impl<E: DeserializeError> Deserr<E> for Vectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Sequence(_) | deserr::Value::Null => {
Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value(
value, location,
)?))
}
deserr::Value::Map(_) => {
Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?))
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[
deserr::ValueKind::Sequence,
deserr::ValueKind::Map,
deserr::ValueKind::Null,
],
},
location,
))),
}
}
}
impl Vectors { impl Vectors {
pub fn must_regenerate(&self) -> bool { pub fn into_array_of_vectors(self) -> Vec<Embedding> {
match self { match self {
Vectors::ImplicitlyUserProvided(_) => false, Vectors::ImplicitlyUserProvided(embeddings)
Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate, | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => {
} embeddings.into_array_of_vectors().unwrap_or_default()
}
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self {
Vectors::ImplicitlyUserProvided(embeddings) => {
Some(embeddings.into_array_of_vectors().unwrap_or_default())
}
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
} }
} }
} }
} }
#[derive(serde::Serialize, Deserr, Debug)] #[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ExplicitVectors { pub struct ExplicitVectors {
#[serde(default)] pub embeddings: VectorOrArrayOfVectors,
#[deserr(default)] pub user_provided: bool,
pub embeddings: Option<VectorOrArrayOfVectors>,
pub regenerate: bool,
}
pub enum VectorState {
Inline(Vectors),
Manual,
Generated,
}
impl VectorState {
pub fn must_regenerate(&self) -> bool {
match self {
VectorState::Inline(vectors) => vectors.must_regenerate(),
VectorState::Manual => false,
VectorState::Generated => true,
}
}
}
pub enum VectorsState {
NoVectorsFid,
NoVectorsFieldInDocument,
Vectors(BTreeMap<String, Vectors>),
} }
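To make the two shapes concrete, a sketch of the `_vectors` payloads that the Vectors enum above distinguishes ("default" is an assumed embedder name):

fn main() {
    // implicitly user-provided: a bare embedding (or array of embeddings)
    let implicit = serde_json::json!({ "_vectors": { "default": [0.1, 0.2, 0.3] } });
    // explicit: an object carrying the embeddings plus the regenerate flag
    let explicit = serde_json::json!({
        "_vectors": {
            "default": { "embeddings": [[0.1, 0.2, 0.3]], "regenerate": false }
        }
    });
    println!("{implicit}\n{explicit}");
}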
pub struct ParsedVectorsDiff {
old: BTreeMap<String, VectorState>, pub old: Option<BTreeMap<String, Vectors>>,
new: VectorsState, pub new: Option<BTreeMap<String, Vectors>>,
}
impl ParsedVectorsDiff {
pub fn new(
docid: DocumentId,
embedders_configs: &[IndexEmbeddingConfig],
documents_diff: KvReader<'_, FieldId>,
old_vectors_fid: Option<FieldId>,
new_vectors_fid: Option<FieldId>,
) -> Result<Self, Error> {
let mut old = match old_vectors_fid let old = match old_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
@ -129,84 +61,48 @@ impl ParsedVectorsDiff {
return Err(error);
}
}
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); .flatten();
for embedding_config in embedders_configs { let new = new_vectors_fid
if embedding_config.user_provided.contains(docid) { .and_then(|vectors_fid| documents_diff.get(vectors_fid))
old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); .map(KvReaderDelAdd::new)
} .map(|obkv| to_vector_map(obkv, DelAdd::Addition))
} .transpose()?
.flatten();
let new = 'new: {
let Some(new_vectors_fid) = new_vectors_fid else {
break 'new VectorsState::NoVectorsFid;
};
let Some(bytes) = documents_diff.get(new_vectors_fid) else {
break 'new VectorsState::NoVectorsFieldInDocument;
};
let obkv = KvReaderDelAdd::new(bytes);
match to_vector_map(obkv, DelAdd::Addition)? {
Some(new) => VectorsState::Vectors(new),
None => VectorsState::NoVectorsFieldInDocument,
}
};
Ok(Self { old, new })
}
pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) { pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) {
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); let old = self.old.as_mut().and_then(|old| old.remove(embedder_name));
let state_from_old = match old { let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
// assume a userProvided is still userProvided
VectorState::Manual => VectorState::Manual,
// generated is still generated
VectorState::Generated => VectorState::Generated,
// weird case that shouldn't happen were the previous docs version is inline,
// but it was removed in the new version
// Since it is not in the new version, we switch to generated
VectorState::Inline(_) => VectorState::Generated,
};
let new = match &mut self.new {
VectorsState::Vectors(new) => {
new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old)
}
_ =>
// if no `_vectors` field is present in the new document,
// the state depends on the previous version of the document
{
state_from_old
}
};
(old, new)
}
}
pub struct ParsedVectors(pub BTreeMap<String, Vectors>);
impl<E: DeserializeError> Deserr<E> for ParsedVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
let value = <BTreeMap<String, Vectors>>::deserialize_from_value(value, location)?;
Ok(ParsedVectors(value))
}
}
impl ParsedVectors {
pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?; let Ok(value) = from_slice(value) else {
deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error }) let value = from_slice(value).map_err(Error::InternalSerdeJson)?;
return Err(Error::InvalidMap(value));
};
Ok(ParsedVectors(value))
} }
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) { pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, _v| !embedders.contains(k)) self.0.retain(|k, v| match v {
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
});
}
}
pub enum Error {
InvalidMap(Value),
InvalidEmbedderConf { error: deserr::errors::JsonError },
InternalSerdeJson(serde_json::Error),
}
@ -216,12 +112,6 @@ impl Error {
Error::InvalidMap(value) => {
crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
}
Error::InvalidEmbedderConf { error } => {
crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
document_id,
error,
})
}
Error::InternalSerdeJson(error) => {
crate::Error::InternalError(InternalError::SerdeJson(error))
}
@ -242,84 +132,13 @@ fn to_vector_map(
}
/// Represents either a vector or an array of multiple vectors.
#[derive(serde::Serialize, Debug)] #[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
#[serde(with = "either::serde_untagged_optional")]
inner: Option<either::Either<Vec<Embedding>, Embedding>>,
}
impl<E: DeserializeError> Deserr<E> for VectorOrArrayOfVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }),
deserr::Value::Sequence(seq) => {
let mut iter = seq.into_iter();
match iter.next().map(|v| v.into_value()) {
None => {
// With the strange way serde serialize the `Either`, we must send the left part
// otherwise it'll consider we returned [[]]
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) })
}
Some(val @ deserr::Value::Sequence(_)) => {
let first = Embedding::deserialize_from_value(val, location.push_index(0))?;
let mut collect = vec![first];
let mut tail = iter
.enumerate()
.map(|(i, v)| {
Embedding::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
collect.append(&mut tail);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) })
}
Some(
val @ deserr::Value::Integer(_)
| val @ deserr::Value::NegativeInteger(_)
| val @ deserr::Value::Float(_),
) => {
let first = <f32>::deserialize_from_value(val, location.push_index(0))?;
let mut embedding = iter
.enumerate()
.map(|(i, v)| {
<f32>::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
embedding.insert(0, first);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) })
}
Some(value) => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float],
},
location.push_index(0),
))),
}
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null],
},
location,
))),
}
}
}
impl VectorOrArrayOfVectors {
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self.inner? {
@ -331,41 +150,21 @@ impl VectorOrArrayOfVectors {
pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
Self { inner: Some(either::Either::Left(array_of_vec)) }
}
pub fn from_vector(vec: Embedding) -> Self {
Self { inner: Some(either::Either::Right(vec)) }
}
}
impl From<Embedding> for VectorOrArrayOfVectors {
fn from(vec: Embedding) -> Self {
Self::from_vector(vec)
}
}
impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
fn from(vec: Vec<Embedding>) -> Self {
Self::from_array_of_vectors(vec)
}
}
#[cfg(test)]
mod test {
use super::VectorOrArrayOfVectors;
fn embedding_from_str(s: &str) -> Result<VectorOrArrayOfVectors, deserr::errors::JsonError> {
let value: serde_json::Value = serde_json::from_str(s).unwrap();
deserr::deserialize(value)
}
#[test]
fn array_of_vectors() {
let null = embedding_from_str("null").unwrap(); let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
let empty = embedding_from_str("[]").unwrap(); let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
let one = embedding_from_str("[0.1]").unwrap(); let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
let two = embedding_from_str("[0.1, 0.2]").unwrap(); let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap(); let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); let two_vecs: VectorOrArrayOfVectors =
serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");

View File

@ -1,5 +1,4 @@
use deserr::Deserr;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use super::rest::InputType;
@ -73,238 +72,6 @@ pub fn check_unset<T>(
}
}
/// Indicates what action should take place during a reindexing operation for an embedder
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReindexAction {
/// An indexing operation should take place for this embedder, keeping existing vectors
/// and checking whether the document template changed or not
RegeneratePrompts,
/// An indexing operation should take place for all documents for this embedder, removing existing vectors
/// (except userProvided ones)
FullReindex,
}
pub enum SettingsDiff {
Remove,
Reindex { action: ReindexAction, updated_settings: EmbeddingSettings },
UpdateWithoutReindex { updated_settings: EmbeddingSettings },
}
pub enum EmbedderAction {
WriteBackToDocuments(WriteBackToDocuments),
Reindex(ReindexAction),
}
pub struct WriteBackToDocuments {
pub embedder_id: u8,
pub user_provided: RoaringBitmap,
}
impl SettingsDiff {
pub fn from_settings(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Self {
match new {
Setting::Set(new) => {
let EmbeddingSettings {
mut source,
mut model,
mut revision,
mut api_key,
mut dimensions,
mut document_template,
mut url,
mut query,
mut input_field,
mut path_to_embeddings,
mut embedding_object,
mut input_type,
mut distribution,
} = old;
let EmbeddingSettings {
source: new_source,
model: new_model,
revision: new_revision,
api_key: new_api_key,
dimensions: new_dimensions,
document_template: new_document_template,
url: new_url,
query: new_query,
input_field: new_input_field,
path_to_embeddings: new_path_to_embeddings,
embedding_object: new_embedding_object,
input_type: new_input_type,
distribution: new_distribution,
} = new;
let mut reindex_action = None;
// **Warning**: do not use short-circuiting || here, we want all these operations applied
if source.apply(new_source) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
// when the source changes, we need to reapply the default settings for the new source
apply_default_for_source(
&source,
&mut model,
&mut revision,
&mut dimensions,
&mut url,
&mut query,
&mut input_field,
&mut path_to_embeddings,
&mut embedding_object,
&mut input_type,
&mut document_template,
)
}
if model.apply(new_model) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if revision.apply(new_revision) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if dimensions.apply(new_dimensions) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if url.apply(new_url) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if query.apply(new_query) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_field.apply(new_input_field) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if path_to_embeddings.apply(new_path_to_embeddings) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if embedding_object.apply(new_embedding_object) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_type.apply(new_input_type) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if document_template.apply(new_document_template) {
ReindexAction::push_action(
&mut reindex_action,
ReindexAction::RegeneratePrompts,
);
}
distribution.apply(new_distribution);
api_key.apply(new_api_key);
let updated_settings = EmbeddingSettings {
source,
model,
revision,
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
distribution,
};
match reindex_action {
Some(action) => Self::Reindex { action, updated_settings },
None => Self::UpdateWithoutReindex { updated_settings },
}
}
Setting::Reset => Self::Remove,
Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old },
}
}
}
impl ReindexAction {
fn push_action(this: &mut Option<Self>, other: Self) {
*this = match (*this, other) {
(_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
(Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
(_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
}
}
}
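A standalone restatement of the push_action merge rule above, showing that an already required FullReindex can never be downgraded by a later prompt-only change:

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReindexAction {
    RegeneratePrompts,
    FullReindex,
}

fn push_action(this: &mut Option<ReindexAction>, other: ReindexAction) {
    *this = match (*this, other) {
        (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
        (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
        (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
    };
}

fn main() {
    let mut action = None;
    push_action(&mut action, ReindexAction::RegeneratePrompts);
    push_action(&mut action, ReindexAction::FullReindex);
    push_action(&mut action, ReindexAction::RegeneratePrompts);
    // a later prompt-only change cannot downgrade the required full reindex
    assert_eq!(action, Some(ReindexAction::FullReindex));
}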
#[allow(clippy::too_many_arguments)] // private function
fn apply_default_for_source(
source: &Setting<EmbedderSource>,
model: &mut Setting<String>,
revision: &mut Setting<String>,
dimensions: &mut Setting<usize>,
url: &mut Setting<String>,
query: &mut Setting<serde_json::Value>,
input_field: &mut Setting<Vec<String>>,
path_to_embeddings: &mut Setting<Vec<String>>,
embedding_object: &mut Setting<Vec<String>>,
input_type: &mut Setting<InputType>,
document_template: &mut Setting<String>,
) {
match source {
Setting::Set(EmbedderSource::HuggingFace) => {
*model = Setting::Reset;
*revision = Setting::Reset;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Ollama) => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Rest) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::Reset;
*query = Setting::Reset;
*input_field = Setting::Reset;
*path_to_embeddings = Setting::Reset;
*embedding_object = Setting::Reset;
*input_type = Setting::Reset;
}
Setting::Set(EmbedderSource::UserProvided) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
*document_template = Setting::NotSet;
}
Setting::NotSet => {}
}
}
pub fn check_set<T>(
key: &Setting<T>,
field: &'static str,
@ -443,6 +210,66 @@ impl EmbeddingSettings {
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
}
}
pub(crate) fn apply_and_need_reindex(
old: &mut Setting<EmbeddingSettings>,
new: Setting<EmbeddingSettings>,
) -> bool {
match (old, new) {
(
Setting::Set(EmbeddingSettings {
source: old_source,
model: old_model,
revision: old_revision,
api_key: old_api_key,
dimensions: old_dimensions,
document_template: old_document_template,
url: old_url,
query: old_query,
input_field: old_input_field,
path_to_embeddings: old_path_to_embeddings,
embedding_object: old_embedding_object,
input_type: old_input_type,
distribution: old_distribution,
}),
Setting::Set(EmbeddingSettings {
source: new_source,
model: new_model,
revision: new_revision,
api_key: new_api_key,
dimensions: new_dimensions,
document_template: new_document_template,
url: new_url,
query: new_query,
input_field: new_input_field,
path_to_embeddings: new_path_to_embeddings,
embedding_object: new_embedding_object,
input_type: new_input_type,
distribution: new_distribution,
}),
) => {
let mut needs_reindex = false;
needs_reindex |= old_source.apply(new_source);
needs_reindex |= old_model.apply(new_model);
needs_reindex |= old_revision.apply(new_revision);
needs_reindex |= old_dimensions.apply(new_dimensions);
needs_reindex |= old_document_template.apply(new_document_template);
needs_reindex |= old_url.apply(new_url);
needs_reindex |= old_query.apply(new_query);
needs_reindex |= old_input_field.apply(new_input_field);
needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings);
needs_reindex |= old_embedding_object.apply(new_embedding_object);
needs_reindex |= old_input_type.apply(new_input_type);
old_distribution.apply(new_distribution);
old_api_key.apply(new_api_key);
needs_reindex
}
(Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
_ => true,
}
}
}
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]

Some files were not shown because too many files have changed in this diff