Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-22 14:21:03 +00:00)

Compare commits: fix/update...fragment-f

30 commits: 776e55d209, 3362fb8476, 6d93b36279, 982e989886, 0014ed3114, ab07e9480e, 00e957051e, f244439b4f, 30fd546c12, a930977460, a3b8c2b71f, 39f808714d, 8adf6141e0, df3f282e4d, d81855015b, feb53104e5, 881c37393f, 9e98a25e45, fb73b83abe, 29b74424ad, b4cafec8b3, d43cd40807, 0301d8f239, 2d45124d9b, 40e7284d70, 4d8d34cc93, 5cced0af02, 9c60e9689f, 2052537681, a9bb64c55a
.gitignore (vendored), 2 changed lines

@@ -5,7 +5,7 @@
**/*.json_lines
**/*.rs.bk
/*.mdb
/*.ms
/data.ms
/snapshots
/dumps
/bench
Cargo.lock (generated), 5 changed lines

@@ -3775,7 +3775,6 @@ dependencies = [
"meili-snap",
"meilisearch-auth",
"meilisearch-types",
"memmap2",
"mimalloc",
"mime",
"mopa-maintained",
@@ -3909,9 +3908,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"

[[package]]
name = "memmap2"
version = "0.9.7"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28"
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
dependencies = [
"libc",
"stable_deref_trait",
@@ -14,7 +14,7 @@ license.workspace = true
anyhow = "1.0.98"
bumpalo = "3.18.1"
csv = "1.3.1"
memmap2 = "0.9.7"
memmap2 = "0.9.5"
milli = { path = "../milli" }
mimalloc = { version = "0.1.47", default-features = false }
serde_json = { version = "1.0.140", features = ["preserve_order"] }
@@ -55,3 +55,4 @@ harness = false
[[bench]]
name = "sort"
harness = false
@@ -1,4 +1,3 @@
use std::fs::File;
use std::str::FromStr;

use super::v2_to_v3::CompatV2ToV3;
@@ -95,10 +94,6 @@ impl CompatIndexV1ToV2 {
self.from.documents().map(|it| Box::new(it) as Box<dyn Iterator<Item = _>>)
}

pub fn documents_file(&self) -> &File {
self.from.documents_file()
}

pub fn settings(&mut self) -> Result<v2::settings::Settings<v2::settings::Checked>> {
Ok(v2::settings::Settings::<v2::settings::Unchecked>::from(self.from.settings()?).check())
}

@@ -1,4 +1,3 @@
use std::fs::File;
use std::str::FromStr;

use time::OffsetDateTime;
@@ -123,13 +122,6 @@ impl CompatIndexV2ToV3 {
}
}

pub fn documents_file(&self) -> &File {
match self {
CompatIndexV2ToV3::V2(v2) => v2.documents_file(),
CompatIndexV2ToV3::Compat(compat) => compat.documents_file(),
}
}

pub fn settings(&mut self) -> Result<v3::Settings<v3::Checked>> {
let settings = match self {
CompatIndexV2ToV3::V2(from) => from.settings()?,

@@ -1,5 +1,3 @@
use std::fs::File;

use super::v2_to_v3::{CompatIndexV2ToV3, CompatV2ToV3};
use super::v4_to_v5::CompatV4ToV5;
use crate::reader::{v3, v4, UpdateFile};
@@ -254,13 +252,6 @@ impl CompatIndexV3ToV4 {
}
}

pub fn documents_file(&self) -> &File {
match self {
CompatIndexV3ToV4::V3(v3) => v3.documents_file(),
CompatIndexV3ToV4::Compat(compat) => compat.documents_file(),
}
}

pub fn settings(&mut self) -> Result<v4::Settings<v4::Checked>> {
Ok(match self {
CompatIndexV3ToV4::V3(v3) => {

@@ -1,5 +1,3 @@
use std::fs::File;

use super::v3_to_v4::{CompatIndexV3ToV4, CompatV3ToV4};
use super::v5_to_v6::CompatV5ToV6;
use crate::reader::{v4, v5, Document};
@@ -243,13 +241,6 @@ impl CompatIndexV4ToV5 {
}
}

pub fn documents_file(&self) -> &File {
match self {
CompatIndexV4ToV5::V4(v4) => v4.documents_file(),
CompatIndexV4ToV5::Compat(compat) => compat.documents_file(),
}
}

pub fn settings(&mut self) -> Result<v5::Settings<v5::Checked>> {
match self {
CompatIndexV4ToV5::V4(v4) => Ok(v5::Settings::from(v4.settings()?).check()),

@@ -1,4 +1,3 @@
use std::fs::File;
use std::num::NonZeroUsize;
use std::str::FromStr;

@@ -244,13 +243,6 @@ impl CompatIndexV5ToV6 {
}
}

pub fn documents_file(&self) -> &File {
match self {
CompatIndexV5ToV6::V5(v5) => v5.documents_file(),
CompatIndexV5ToV6::Compat(compat) => compat.documents_file(),
}
}

pub fn settings(&mut self) -> Result<v6::Settings<v6::Checked>> {
match self {
CompatIndexV5ToV6::V5(v5) => Ok(v6::Settings::from(v5.settings()?).check()),

@@ -192,14 +192,6 @@ impl DumpIndexReader {
}
}

/// A reference to a file in the NDJSON format containing all the documents of the index
pub fn documents_file(&self) -> &File {
match self {
DumpIndexReader::Current(v6) => v6.documents_file(),
DumpIndexReader::Compat(compat) => compat.documents_file(),
}
}

pub fn settings(&mut self) -> Result<v6::Settings<v6::Checked>> {
match self {
DumpIndexReader::Current(v6) => v6.settings(),

@@ -72,10 +72,6 @@ impl V1IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<self::settings::Settings> {
Ok(serde_json::from_reader(&mut self.settings)?)
}

@@ -203,10 +203,6 @@ impl V2IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

@@ -215,10 +215,6 @@ impl V3IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

@@ -210,10 +210,6 @@ impl V4IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

@@ -247,10 +247,6 @@ impl V5IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

@@ -284,10 +284,6 @@ impl V6IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}

pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}

pub fn settings(&mut self) -> Result<Settings<Checked>> {
let mut settings: Settings<Unchecked> = serde_json::from_reader(&mut self.settings)?;
patch_embedders(&mut settings);
@@ -60,7 +60,7 @@ use nom::combinator::{cut, eof, map, opt};
use nom::multi::{many0, separated_list1};
use nom::number::complete::recognize_float;
use nom::sequence::{delimited, preceded, terminated, tuple};
use nom::Finish;
use nom::{Finish, Slice};
use nom_locate::LocatedSpan;
pub(crate) use value::parse_value;
use value::word_exact;
@@ -121,6 +121,16 @@ impl<'a> Token<'a> {
Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat))
}
}

/// Split the token by a delimiter and return an iterator of tokens.
/// Each token in the iterator will have its own span that corresponds to a slice of the original token's span.
pub fn split(&self, delimiter: &'a str) -> impl Iterator<Item = Token<'a>> + '_ {
let original_addr = self.value().as_ptr() as usize;
self.value().split(delimiter).map(move |part| {
let offset = part.as_ptr() as usize - original_addr;
Token::new(self.span.slice(offset..offset + part.len()), Some(part.to_string()))
})
}
}

impl<'a> From<Span<'a>> for Token<'a> {
@@ -179,6 +189,25 @@ impl<'a> FilterCondition<'a> {
}
}

pub fn use_vector_filter(&self) -> Option<&Token> {
match self {
FilterCondition::Condition { fid, op: _ } => {
if fid.value().starts_with("_vectors.") || fid.value() == "_vectors" {
Some(fid)
} else {
None
}
}
FilterCondition::Not(this) => this.use_vector_filter(),
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
seq.iter().find_map(|filter| filter.use_vector_filter())
}
FilterCondition::GeoLowerThan { .. }
| FilterCondition::GeoBoundingBox { .. }
| FilterCondition::In { .. } => None,
}
}

pub fn fids(&self, depth: usize) -> Box<dyn Iterator<Item = &Token> + '_> {
if depth == 0 {
return Box::new(std::iter::empty());
@@ -1043,4 +1072,103 @@ pub mod tests {
let token: Token = s.into();
assert_eq!(token.value(), s);
}

#[test]
fn split() {
let s = "test string that should not be parsed\n newline";
let token: Token = s.into();
let parts: Vec<_> = token.split(" ").collect();
insta::assert_snapshot!(format!("{parts:#?}"), @r#"
[
Token {
span: LocatedSpan {
offset: 0,
line: 1,
fragment: "test",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"test",
),
},
Token {
span: LocatedSpan {
offset: 5,
line: 1,
fragment: "string",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"string",
),
},
Token {
span: LocatedSpan {
offset: 12,
line: 1,
fragment: "that",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"that",
),
},
Token {
span: LocatedSpan {
offset: 17,
line: 1,
fragment: "should",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"should",
),
},
Token {
span: LocatedSpan {
offset: 24,
line: 1,
fragment: "not",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"not",
),
},
Token {
span: LocatedSpan {
offset: 28,
line: 1,
fragment: "be",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"be",
),
},
Token {
span: LocatedSpan {
offset: 31,
line: 1,
fragment: "parsed\n",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"parsed\n",
),
},
Token {
span: LocatedSpan {
offset: 39,
line: 2,
fragment: "newline",
extra: "test string that should not be parsed\n newline",
},
value: Some(
"newline",
),
},
]
"#);
}
}
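The `Token::split` method shown above relies on the fact that `str::split` yields subslices of the original buffer, so each part's byte offset can be recovered with pointer arithmetic and then used to slice the original span. A minimal standalone sketch of that trick (plain std, not the PR's code):

```rust
// Sketch only: recover each part's byte offset in the parent string the same way
// `Token::split` does before slicing its `LocatedSpan`.
fn main() {
    let original = "test string that should not be parsed\n newline";
    let base = original.as_ptr() as usize;
    for part in original.split(' ') {
        // `part` points into `original`, so subtracting the base pointer gives its offset.
        let offset = part.as_ptr() as usize - base;
        println!("{offset:2} -> {part:?}");
    }
}
```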
@@ -26,7 +26,7 @@ flate2 = "1.1.2"
indexmap = "2.9.0"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.7"
memmap2 = "0.9.5"
page_size = "0.6.0"
rayon = "1.10.0"
roaring = { version = "0.10.12", features = ["serde"] }
@@ -20,7 +20,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {

let IndexScheduler {
cleanup_enabled: _,
experimental_no_edition_2024_for_dumps: _,
processing_tasks,
env,
version,

@@ -168,9 +168,6 @@ pub struct IndexScheduler {
/// Whether we should automatically cleanup the task queue or not.
pub(crate) cleanup_enabled: bool,

/// Whether we should use the old document indexer or the new one.
pub(crate) experimental_no_edition_2024_for_dumps: bool,

/// The webhook url we should send tasks to after processing every batches.
pub(crate) webhook_url: Option<String>,
/// The Authorization header to send to the webhook URL.
@@ -213,7 +210,6 @@ impl IndexScheduler {

index_mapper: self.index_mapper.clone(),
cleanup_enabled: self.cleanup_enabled,
experimental_no_edition_2024_for_dumps: self.experimental_no_edition_2024_for_dumps,
webhook_url: self.webhook_url.clone(),
webhook_authorization_header: self.webhook_authorization_header.clone(),
embedders: self.embedders.clone(),
@@ -300,9 +296,6 @@ impl IndexScheduler {
index_mapper,
env,
cleanup_enabled: options.cleanup_enabled,
experimental_no_edition_2024_for_dumps: options
.indexer_config
.experimental_no_edition_2024_for_dumps,
webhook_url: options.webhook_url,
webhook_authorization_header: options.webhook_authorization_header,
embedders: Default::default(),
@@ -601,11 +594,6 @@ impl IndexScheduler {
Ok(nbr_index_processing_tasks > 0)
}

/// Whether the index should use the old document indexer.
pub fn no_edition_2024_for_dumps(&self) -> bool {
self.experimental_no_edition_2024_for_dumps
}

/// Return the tasks matching the query from the user's point of view along
/// with the total number of tasks matching the query, ignoring from and limit.
///
@@ -736,7 +736,7 @@ fn test_document_addition_mixed_rights_with_index() {
#[test]
fn test_document_addition_mixed_right_without_index_starts_with_cant_create() {
// We're going to autobatch multiple document addition.
// - The index does not exists
// - The index does not exist
// - The first document addition don't have the right to create an index
// - The second do. They should not batch together.
// - The second should batch with everything else as it's going to create an index.
@@ -24,7 +24,7 @@ enum-iterator = "2.1.0"
file-store = { path = "../file-store" }
flate2 = "1.1.2"
fst = "0.4.7"
memmap2 = "0.9.7"
memmap2 = "0.9.5"
milli = { path = "../milli" }
roaring = { version = "0.10.12", features = ["serde"] }
rustc-hash = "2.1.1"
@@ -50,7 +50,6 @@ jsonwebtoken = "9.3.1"
lazy_static = "1.5.0"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.7"
mimalloc = { version = "0.1.47", default-features = false }
mime = "0.3.17"
num_cpus = "1.17.0"
@@ -203,7 +203,6 @@ struct Infos {
experimental_composite_embedders: bool,
experimental_embedding_cache_entries: usize,
experimental_no_snapshot_compaction: bool,
experimental_no_edition_2024_for_dumps: bool,
experimental_no_edition_2024_for_settings: bool,
gpu_enabled: bool,
db_path: bool,
@@ -294,7 +293,6 @@ impl Infos {
max_indexing_threads,
skip_index_budget: _,
experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps,
} = indexer_options;

let RuntimeTogglableFeatures {
@@ -331,7 +329,6 @@ impl Infos {
experimental_composite_embedders: composite_embedders,
experimental_embedding_cache_entries,
experimental_no_snapshot_compaction,
experimental_no_edition_2024_for_dumps,
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(),
@@ -30,7 +30,6 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest};
use analytics::Analytics;
use anyhow::bail;
use bumpalo::Bump;
use error::PayloadError;
use extractors::payload::PayloadConfig;
use index_scheduler::versioning::Versioning;
@@ -39,7 +38,6 @@ use meilisearch_auth::{open_auth_store_env, AuthController};
use meilisearch_types::milli::constants::VERSION_MAJOR;
use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use meilisearch_types::milli::progress::{EmbedderStats, Progress};
use meilisearch_types::milli::update::new::indexer;
use meilisearch_types::milli::update::{
default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig,
};
@@ -535,7 +533,7 @@ fn import_dump(
let mut index_reader = index_reader?;
let metadata = index_reader.metadata();
let uid = metadata.uid.clone();
tracing::info!("Importing index `{uid}`.");
tracing::info!("Importing index `{}`.", metadata.uid);

let date = Some((metadata.created_at, metadata.updated_at));
let index = index_scheduler.create_raw_index(&metadata.uid, date)?;
@@ -554,100 +552,48 @@ fn import_dump(
apply_settings_to_builder(&settings, &mut builder);
let embedder_stats: Arc<EmbedderStats> = Default::default();
builder.execute(&|| false, &progress, embedder_stats.clone())?;
wtxn.commit()?;

let mut wtxn = index.write_txn()?;
let rtxn = index.read_txn()?;

if index_scheduler.no_edition_2024_for_dumps() {
// 5.3 Import the documents.
// 5.3.1 We need to recreate the grenad+obkv format accepted by the index.
tracing::info!("Importing the documents.");
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
builder.append_json_object(&document?)?;
}

// This flush the content of the batch builder.
let file = builder.into_inner()?.into_inner()?;

// 5.3.2 We feed it to the milli index.
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;

let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?;

let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
indexer_config,
IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| tracing::trace!("update: {:?}", indexing_step),
|| false,
&embedder_stats,
)?;

let builder = builder.with_embedders(embedders);

let (builder, user_result) = builder.add_documents(reader)?;
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
builder.execute()?;
} else {
let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
let primary_key = index.primary_key(&rtxn)?;
let mut new_fields_ids_map = db_fields_ids_map.clone();

let mut indexer = indexer::DocumentOperation::new();
let embedders = index.embedding_configs().embedding_configs(&rtxn)?;
let embedders = index_scheduler.embedders(uid.clone(), embedders)?;

let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? };

indexer.replace_documents(&mmap)?;

let indexer_config = index_scheduler.indexer_config();
let pool = &indexer_config.thread_pool;

let indexer_alloc = Bump::new();
let (document_changes, mut operation_stats, primary_key) = indexer.into_changes(
&indexer_alloc,
&index,
&rtxn,
primary_key,
&mut new_fields_ids_map,
&|| false, // never stop processing a dump
progress.clone(),
)?;

let operation_stats = operation_stats.pop().unwrap();
if let Some(error) = operation_stats.error {
return Err(error.into());
}

let _congestion = indexer::index(
&mut wtxn,
&index,
pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false, // never stop processing a dump
&progress,
&embedder_stats,
)?;
// 5.3 Import the documents.
// 5.3.1 We need to recreate the grenad+obkv format accepted by the index.
tracing::info!("Importing the documents.");
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
builder.append_json_object(&document?)?;
}

// This flush the content of the batch builder.
let file = builder.into_inner()?.into_inner()?;

// 5.3.2 We feed it to the milli index.
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;

let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?;

let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
indexer_config,
IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| tracing::trace!("update: {:?}", indexing_step),
|| false,
&embedder_stats,
)?;

let builder = builder.with_embedders(embedders);

let (builder, user_result) = builder.add_documents(reader)?;
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
builder.execute()?;
wtxn.commit()?;
tracing::info!("All documents successfully imported.");

index_scheduler.refresh_index_stats(&uid)?;
}
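One of the two import paths visible in the hunk above memory-maps the dump's NDJSON documents file with `memmap2` and hands the raw bytes to the document indexer, while the other re-encodes every document through `DocumentsBatchBuilder`. A minimal standalone sketch of the memory-mapping step, assuming only the `memmap2` and `tempfile` crates that already appear in this diff:

```rust
// Sketch only: map a documents file into memory and read it as one byte slice,
// the same call shape as `memmap2::Mmap::map(index_reader.documents_file())` above.
use std::io::Write;

fn main() -> std::io::Result<()> {
    let mut file = tempfile::tempfile()?;
    file.write_all(b"{\"id\":1}\n{\"id\":2}\n")?; // NDJSON, like the dump's documents file
    file.flush()?;
    // Safety: the file is not resized or written through another handle while the map is alive.
    let mmap = unsafe { memmap2::Mmap::map(&file)? };
    assert_eq!(&mmap[..8], &b"{\"id\":1}"[..]);
    Ok(())
}
```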
@@ -68,8 +68,6 @@ const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str =
const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str =
"MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES";
const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION";
const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS: &str =
"MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms";
const DEFAULT_HTTP_ADDR: &str = "localhost:7700";
@@ -761,15 +759,6 @@ pub struct IndexerOpts {
#[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)]
#[serde(default)]
pub experimental_no_edition_2024_for_settings: bool,

/// Experimental make dump imports use the old document indexer.
///
/// When enabled, Meilisearch will use the old document indexer when importing dumps.
///
/// For more information, see <https://github.com/orgs/meilisearch/discussions/851>.
#[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)]
#[serde(default)]
pub experimental_no_edition_2024_for_dumps: bool,
}

impl IndexerOpts {
@@ -780,7 +769,6 @@ impl IndexerOpts {
max_indexing_threads,
skip_index_budget: _,
experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps,
} = self;
if let Some(max_indexing_memory) = max_indexing_memory.0 {
export_to_env_if_not_present(
@@ -800,12 +788,6 @@ impl IndexerOpts {
experimental_no_edition_2024_for_settings.to_string(),
);
}
if experimental_no_edition_2024_for_dumps {
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS,
experimental_no_edition_2024_for_dumps.to_string(),
);
}
}
}

@@ -826,7 +808,6 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
skip_index_budget: other.skip_index_budget,
experimental_no_edition_2024_for_settings: other
.experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps: other.experimental_no_edition_2024_for_dumps,
chunk_compression_type: Default::default(),
chunk_compression_level: Default::default(),
documents_chunk_size: Default::default(),
@@ -138,6 +138,8 @@ pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
per_document_id: bool,
// if a filter was used
per_filter: bool,
with_vector_filter: bool,

// if documents were sorted
sort: bool,

@@ -165,6 +167,7 @@ impl<Method: AggregateMethod> Aggregate for DocumentsFetchAggregator<Method> {
Box::new(Self {
per_document_id: self.per_document_id | new.per_document_id,
per_filter: self.per_filter | new.per_filter,
with_vector_filter: self.with_vector_filter | new.with_vector_filter,
sort: self.sort | new.sort,
retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors,
max_limit: self.max_limit.max(new.max_limit),
@@ -249,6 +252,7 @@ pub async fn get_document(
retrieve_vectors: param_retrieve_vectors.0,
per_document_id: true,
per_filter: false,
with_vector_filter: false,
sort: false,
max_limit: 0,
max_offset: 0,
@@ -474,6 +478,10 @@ pub async fn documents_by_query_post(
analytics.publish(
DocumentsFetchAggregator::<DocumentsPOST> {
per_filter: body.filter.is_some(),
with_vector_filter: body
.filter
.as_ref()
.is_some_and(|f| f.to_string().contains("_vectors")),
sort: body.sort.is_some(),
retrieve_vectors: body.retrieve_vectors,
max_limit: body.limit,
@@ -575,6 +583,10 @@ pub async fn get_documents(
analytics.publish(
DocumentsFetchAggregator::<DocumentsGET> {
per_filter: query.filter.is_some(),
with_vector_filter: query
.filter
.as_ref()
.is_some_and(|f| f.to_string().contains("_vectors")),
sort: query.sort.is_some(),
retrieve_vectors: query.retrieve_vectors,
max_limit: query.limit,
@@ -1454,8 +1466,6 @@ fn some_documents<'a, 't: 'a>(
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
// Clippy is simply wrong
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
@@ -40,6 +40,7 @@ pub struct SearchAggregator<Method: AggregateMethod> {
// filter
filter_with_geo_radius: bool,
filter_with_geo_bounding_box: bool,
filter_on_vectors: bool,
// every time a request has a filter, this field must be incremented by the number of terms it contains
filter_sum_of_criteria_terms: usize,
// every time a request has a filter, this field must be incremented by one
@@ -163,6 +164,7 @@ impl<Method: AggregateMethod> SearchAggregator<Method> {
let stringified_filters = filter.to_string();
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
ret.filter_on_vectors = stringified_filters.contains("_vectors");
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
}

@@ -260,6 +262,7 @@ impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_on_vectors,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
@@ -314,6 +317,7 @@ impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
// filter
self.filter_with_geo_radius |= filter_with_geo_radius;
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
self.filter_on_vectors |= filter_on_vectors;
self.filter_sum_of_criteria_terms =
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
self.filter_total_number_of_criteria =
@@ -388,6 +392,7 @@ impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_on_vectors,
filter_sum_of_criteria_terms,
filter_total_number_of_criteria,
used_syntax,
@@ -445,6 +450,7 @@ impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
"filter": {
"with_geoRadius": filter_with_geo_radius,
"with_geoBoundingBox": filter_with_geo_bounding_box,
"on_vectors": filter_on_vectors,
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
},
@@ -336,7 +336,7 @@ impl<Method: AggregateMethod + 'static> Aggregate for TaskFilterAnalytics<Method
"link": "https://docs.meilisearch.com/errors#missing_authorization_header"
}
)),
(status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!(
(status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
{
"message": "Task :taskUid not found.",
"code": "task_not_found",
@@ -430,7 +430,7 @@ async fn cancel_tasks(
"link": "https://docs.meilisearch.com/errors#missing_authorization_header"
}
)),
(status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!(
(status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
{
"message": "Task :taskUid not found.",
"code": "task_not_found",
@@ -611,7 +611,7 @@ async fn get_tasks(
"link": "https://docs.meilisearch.com/errors#missing_authorization_header"
}
)),
(status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!(
(status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
{
"message": "Task :taskUid not found.",
"code": "task_not_found",
@@ -665,7 +665,7 @@ async fn get_task(
"link": "https://docs.meilisearch.com/errors#missing_authorization_header"
}
)),
(status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!(
(status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
{
"message": "Task :taskUid not found.",
"code": "task_not_found",
@@ -2078,7 +2078,7 @@ pub(crate) fn parse_filter(
})?;

if let Some(ref filter) = filter {
// If the contains operator is used while the contains filter features is not enabled, errors out
// If the contains operator is used while the contains filter feature is not enabled, errors out
if let Some((token, error)) =
filter.use_contains_operator().zip(features.check_contains_filter().err())
{
@@ -2089,6 +2089,18 @@ pub(crate) fn parse_filter(
}
}

if let Some(ref filter) = filter {
// If a vector filter is used while the multi modal feature is not enabled, errors out
if let Some((token, error)) =
filter.use_vector_filter().zip(features.check_multimodal("using a vector filter").err())
{
return Err(ResponseError::from_msg(
token.as_external_error(error).to_string(),
Code::FeatureNotEnabled,
));
}
}

Ok(filter)
}
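The added check follows the same shape as the existing `contains` gate: the filter reports the offending token as an `Option`, the feature check reports failure as a `Result`, and zipping the two means an error is raised only when a gated construct is actually used while the feature is off. A small standalone illustration of that `Option::zip` pattern (plain Rust, not Meilisearch code):

```rust
// Sketch only: error out when a gated construct is present AND the feature check failed.
fn check(uses_vector_filter: Option<&str>, multimodal_enabled: bool) -> Result<(), String> {
    let feature_check: Result<(), &str> =
        if multimodal_enabled { Ok(()) } else { Err("multimodal is not enabled") };
    // `zip` is `Some` only when both the offending token and the feature error exist.
    if let Some((token, error)) = uses_vector_filter.zip(feature_check.err()) {
        return Err(format!("`{token}`: {error}"));
    }
    Ok(())
}

fn main() {
    assert!(check(None, false).is_ok()); // no vector filter: never errors
    assert!(check(Some("_vectors.rest"), true).is_ok()); // feature enabled: ok
    assert!(check(Some("_vectors.rest"), false).is_err()); // gated syntax without the feature
}
```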
@@ -466,7 +466,6 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
// Having 2 threads makes the tests way faster
max_indexing_threads: MaxThreads::from_str("2").unwrap(),
experimental_no_edition_2024_for_settings: false,
experimental_no_edition_2024_for_dumps: false,
},
experimental_enable_metrics: false,
..Parser::parse_from(None as Option<&str>)
@@ -557,7 +557,7 @@ async fn delete_document_by_filter() {
"###);

let index = shared_does_not_exists_index().await;
// index does not exists
// index does not exist
let (response, _code) =
index.delete_document_by_filter_fail(json!({ "filter": "doggo = bernese"})).await;
snapshot!(response, @r###"
@@ -304,7 +304,7 @@ async fn search_bad_filter() {
let server = Server::new_shared();
let index = server.unique_index();
// Also, to trigger the error message we need to effectively create the index or else it'll throw an
// index does not exists error.
// index does not exist error.
let (response, _code) = index.create(None).await;
server.wait_task(response.uid()).await.succeeded();

@@ -1263,7 +1263,7 @@ async fn search_with_contains_without_enabling_the_feature() {
let server = Server::new_shared();
let index = server.unique_index();
// Also, to trigger the error message we need to effectively create the index or else it'll throw an
// index does not exists error.
// index does not exist error.
let (task, _code) = index.create(None).await;
server.wait_task(task.uid()).await.succeeded();
@@ -4,8 +4,8 @@ use tempfile::TempDir;

use super::test_settings_documents_indexing_swapping_and_search;
use crate::common::{
default_settings, shared_index_with_documents, shared_index_with_nested_documents, Server,
DOCUMENTS, NESTED_DOCUMENTS,
default_settings, shared_index_for_fragments, shared_index_with_documents,
shared_index_with_nested_documents, Server, DOCUMENTS, NESTED_DOCUMENTS,
};
use crate::json;

@@ -731,3 +731,457 @@ async fn test_filterable_attributes_priority() {
)
.await;
}

#[actix_rt::test]
async fn vector_filter_all_embedders() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "echo"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 4
}
"#);
}

#[actix_rt::test]
async fn vector_filter_missing_fragment() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "Index `[uuid]`: Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.\n15:24 _vectors.rest.fragments EXISTS",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
}
"#);
}
#[actix_rt::test]
async fn vector_filter_non_existant_embedder() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.other EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "Index `[uuid]`: The embedder `other` does not exist. Available embedders are: `rest`.\n10:15 _vectors.other EXISTS",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
}
"#);
}

#[actix_rt::test]
async fn vector_filter_all_embedders_user_provided() {
let index = shared_index_for_fragments().await;

// This one is counterintuitive, but it is the same as the previous one.
// It's because userProvided is interpreted as an embedder name
let (value, _code) = index
.search_post(json!({
"filter": "_vectors.userProvided EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "Index `[uuid]`: The embedder `userProvided` does not exist. Available embedders are: `rest`.\n10:22 _vectors.userProvided EXISTS",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
}
"#);
}

#[actix_rt::test]
async fn vector_filter_specific_embedder() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "echo"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 4
}
"#);
}

#[actix_rt::test]
async fn vector_filter_user_provided() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.userProvided EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "echo"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"#);
}
#[actix_rt::test]
async fn vector_filter_specific_fragment() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments.withBreed EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"#);

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments.basic EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "echo"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 4
}
"#);
}

#[actix_rt::test]
async fn vector_filter_non_existant_fragment() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments.other EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "Index `[uuid]`: The fragment `other` does not exist on embedder `rest`. Available fragments on this embedder are: `basic`, `withBreed`.\n25:30 _vectors.rest.fragments.other EXISTS",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
}
"#);
}

#[actix_rt::test]
async fn vector_filter_specific_fragment_user_provided() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments.other.userProvided EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "Index `[uuid]`: Vector filter cannot have both `other` and `userProvided`.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
}
"#);
}
#[actix_rt::test]
async fn vector_filter_document_template_but_fragments_used() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.documentTemplate EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0
}
"#);
}

#[actix_rt::test]
async fn vector_filter_document_template() {
let (_mock, setting) = crate::vector::create_mock().await;
let server = crate::vector::get_server_vector().await;
let index = server.index("doggo");

let (_response, code) = server.set_features(json!({"multimodal": true})).await;
snapshot!(code, @"200 OK");

let (response, code) = index
.update_settings(json!({
"embedders": {
"rest": setting,
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await.succeeded();

let documents = json!([
{"id": 0, "name": "kefir"},
{"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }},
{"id": 2, "name": "intel"},
{"id": 3, "name": "iko" }
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await.succeeded();

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.documentTemplate EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "intel"
},
{
"name": "iko"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"#);
}

#[actix_rt::test]
async fn vector_filter_feature_gate() {
let index = shared_index_with_documents().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"message": "using a vector filter requires enabling the `multimodal` experimental feature. See https://github.com/orgs/meilisearch/discussions/846\n1:9 _vectors EXISTS",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"#);
}
#[actix_rt::test]
async fn vector_filter_negation() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.userProvided NOT EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"#);
}

#[actix_rt::test]
async fn vector_filter_or_combination() {
let index = shared_index_for_fragments().await;

let (value, _code) = index
.search_post(json!({
"filter": "_vectors.rest.fragments.withBreed EXISTS OR _vectors.rest.userProvided EXISTS",
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "echo"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"#);
}

#[actix_rt::test]
async fn vector_filter_regenerate() {
let index = shared_index_for_fragments().await;

for selector in ["_vectors.rest.regenerate", "_vectors.rest.fragments.basic.regenerate"] {
let (value, _code) = index
.search_post(json!({
"filter": format!("{selector} EXISTS"),
"attributesToRetrieve": ["name"]
}))
.await;
snapshot!(value, @r#"
{
"hits": [
{
"name": "kefir"
},
{
"name": "intel"
},
{
"name": "dustin"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"#);
}
}
@@ -14,8 +14,9 @@ use meilisearch::option::MaxThreads;
use crate::common::index::Index;
use crate::common::{default_settings, GetAllDocumentsOptions, Server};
use crate::json;
pub use rest::create_mock;

async fn get_server_vector() -> Server {
pub async fn get_server_vector() -> Server {
Server::new().await
}
@@ -12,7 +12,7 @@ use crate::common::Value;
use crate::json;
use crate::vector::{get_server_vector, GetAllDocumentsOptions};

async fn create_mock() -> (&'static MockServer, Value) {
pub async fn create_mock() -> (&'static MockServer, Value) {
let mock_server = Box::leak(Box::new(MockServer::start().await));

let text_to_embedding: BTreeMap<_, _> = vec![
@@ -40,7 +40,7 @@ indexmap = { version = "2.9.0", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memchr = "2.7.5"
memmap2 = "0.9.7"
memmap2 = "0.9.5"
obkv = "0.3.0"
once_cell = "1.21.3"
ordered-float = "5.0.0"
@@ -111,7 +111,7 @@ impl FilterableAttributesFeatures {
self.filter.is_filterable_null()
}

/// Check if `IS EXISTS` is allowed
/// Check if `EXISTS` is allowed
pub fn is_filterable_exists(&self) -> bool {
self.filter.is_filterable_exists()
}
@@ -10,7 +10,7 @@ use memchr::memmem::Finder;
use roaring::{MultiOps, RoaringBitmap};
use serde_json::Value;

use super::facet_range_search;
use super::{facet_range_search, filter_vector::VectorFilter};
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::error::{Error, UserError};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
@@ -228,6 +228,10 @@ impl<'a> Filter<'a> {
pub fn use_contains_operator(&self) -> Option<&Token> {
self.condition.use_contains_operator()
}

pub fn use_vector_filter(&self) -> Option<&Token> {
self.condition.use_vector_filter()
}
}

impl<'a> Filter<'a> {
@@ -235,10 +239,12 @@ impl<'a> Filter<'a> {
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let fields_ids_map = index.fields_ids_map(rtxn)?;
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;

for fid in self.condition.fids(MAX_FILTER_DEPTH) {
let attribute = fid.value();
if matching_features(attribute, &filterable_attributes_rules)
.is_some_and(|(_, features)| features.is_filterable())
|| VectorFilter::matches(attribute)
{
continue;
}
@@ -542,7 +548,18 @@ impl<'a> Filter<'a> {
.union()
}
FilterCondition::Condition { fid, op } => {
let Some(field_id) = field_ids_map.id(fid.value()) else {
let value = fid.value();
if VectorFilter::matches(value) {
if !matches!(op, Condition::Exists) {
return Err(Error::UserError(UserError::InvalidFilter(String::from(
"Vector filter can only be used with the `exists` operator",
))));
}
let vector_filter = VectorFilter::parse(fid)?;
return vector_filter.evaluate(rtxn, index, universe);
}

let Some(field_id) = field_ids_map.id(value) else {
return Ok(RoaringBitmap::new());
};
let Some((rule_index, features)) =
crates/milli/src/search/facet/filter_vector.rs (new file, 276 lines)

@@ -0,0 +1,276 @@
use filter_parser::Token;
use roaring::{MultiOps, RoaringBitmap};

use crate::error::{Error, UserError};
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::{ArroyStats, ArroyWrapper};
use crate::Index;

#[derive(Debug)]
enum VectorFilterInner<'a> {
Fragment { embedder_token: Token<'a>, fragment_token: Token<'a> },
DocumentTemplate { embedder_token: Token<'a> },
UserProvided { embedder_token: Token<'a> },
FullEmbedder { embedder_token: Token<'a> },
}

#[derive(Debug)]
pub(super) struct VectorFilter<'a> {
inner: Option<VectorFilterInner<'a>>,
regenerate: bool,
}

#[derive(Debug, thiserror::Error)]
pub enum VectorFilterError<'a> {
#[error("Vector filter cannot be empty.")]
EmptyFilter,

#[error("Vector filter must start with `_vectors` but found `{}`.", _0.value())]
InvalidPrefix(Token<'a>),

#[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")]
MissingFragmentName(Token<'a>),

#[error("Vector filter cannot have both `{}` and `{}`.", _0.0.value(), _0.1.value())]
ExclusiveOptions(Box<(Token<'a>, Token<'a>)>),

#[error("Vector filter has leftover token: `{}`.", _0.value())]
LeftoverToken(Token<'a>),

#[error("The embedder `{}` does not exist. {}", embedder.value(), {
if available.is_empty() {
String::from("This index does not have any configured embedders.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available embedders are: {}.", available.iter().map(|e| format!("`{e}`")).collect::<Vec<_>>().join(", "))
}
})]
EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec<String> },

#[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), {
if available.is_empty() {
String::from("This embedder does not have any configured fragments.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available fragments on this embedder are: {}.", available.iter().map(|f| format!("`{f}`")).collect::<Vec<_>>().join(", "))
}
})]
FragmentDoesNotExist {
embedder: &'a Token<'a>,
fragment: &'a Token<'a>,
available: Vec<String>,
},
}

use VectorFilterError::*;

impl<'a> From<VectorFilterError<'a>> for Error {
fn from(err: VectorFilterError<'a>) -> Self {
match &err {
EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())),
InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => {
token.clone().as_external_error(err).into()
}
ExclusiveOptions(tokens) => tokens.1.clone().as_external_error(err).into(),
EmbedderDoesNotExist { embedder: token, .. }
| FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(),
}
}
}

impl<'a> VectorFilter<'a> {
pub(super) fn matches(value: &str) -> bool {
value.starts_with("_vectors.") || value == "_vectors"
}

/// Parses a vector filter string.
///
/// Valid formats:
/// - `_vectors`
/// - `_vectors.{embedder_name}`
/// - `_vectors.{embedder_name}.regenerate`
/// - `_vectors.{embedder_name}.userProvided`
/// - `_vectors.{embedder_name}.userProvided.regenerate`
/// - `_vectors.{embedder_name}.documentTemplate`
/// - `_vectors.{embedder_name}.documentTemplate.regenerate`
/// - `_vectors.{embedder_name}.fragments.{fragment_name}`
/// - `_vectors.{embedder_name}.fragments.{fragment_name}.regenerate`
pub(super) fn parse(s: &'a Token<'a>) -> Result<Self, VectorFilterError<'a>> {
let mut split = s.split(".").peekable();

match split.next() {
Some(token) if token.value() == "_vectors" => (),
Some(token) => return Err(InvalidPrefix(token)),
None => return Err(EmptyFilter),
}

let embedder_name = split.next();

let mut fragment_name = None;
if split.peek().map(|t| t.value()) == Some("fragments") {
let token = split.next().expect("it was peeked before");

fragment_name = Some(split.next().ok_or(MissingFragmentName(token))?);
}

let mut user_provided_token = None;
if split.peek().map(|t| t.value()) == Some("userProvided") {
user_provided_token = split.next();
}

let mut document_template_token = None;
if split.peek().map(|t| t.value()) == Some("documentTemplate") {
document_template_token = split.next();
}

let mut regenerate_token = None;
if split.peek().map(|t| t.value()) == Some("regenerate") {
regenerate_token = split.next();
}

let inner = match (fragment_name, user_provided_token, document_template_token) {
(Some(fragment_name), None, None) => Some(VectorFilterInner::Fragment {
embedder_token: embedder_name
.expect("embedder name comes before fragment so it's always Some"),
fragment_token: fragment_name,
}),
(None, Some(_), None) => Some(VectorFilterInner::UserProvided {
embedder_token: embedder_name
.expect("embedder name comes before userProvided so it's always Some"),
}),
(None, None, Some(_)) => Some(VectorFilterInner::DocumentTemplate {
embedder_token: embedder_name
.expect("embedder name comes before documentTemplate so it's always Some"),
}),
(Some(a), Some(b), _) | (_, Some(a), Some(b)) | (Some(a), None, Some(b)) => {
return Err(ExclusiveOptions(Box::new((a, b))));
}
(None, None, None) => embedder_name
.map(|embedder_token| VectorFilterInner::FullEmbedder { embedder_token }),
};

if let Some(next) = split.next() {
return Err(LeftoverToken(next))?;
}

Ok(Self { inner, regenerate: regenerate_token.is_some() })
}
|
||||
pub(super) fn evaluate(
|
||||
self,
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
) -> crate::Result<RoaringBitmap> {
|
||||
let index_embedding_configs = index.embedding_configs();
|
||||
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
|
||||
|
||||
let inners = match self.inner {
|
||||
Some(inner) => vec![inner],
|
||||
None => embedding_configs
|
||||
.iter()
|
||||
.map(|config| VectorFilterInner::FullEmbedder {
|
||||
embedder_token: Token::from(config.name.as_str()),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
let mut docids = inners
|
||||
.iter()
|
||||
.map(|i| i.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate))
|
||||
.union()?;
|
||||
|
||||
if let Some(universe) = universe {
|
||||
docids &= universe;
|
||||
}
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
}
|
||||
|
||||
impl VectorFilterInner<'_> {
fn evaluate_inner(
&self,
rtxn: &heed::RoTxn<'_>,
index: &Index,
embedding_configs: &[IndexEmbeddingConfig],
regenerate: bool,
) -> crate::Result<RoaringBitmap> {
let embedder = match self {
VectorFilterInner::Fragment { embedder_token, .. } => embedder_token,
VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token,
VectorFilterInner::UserProvided { embedder_token } => embedder_token,
VectorFilterInner::FullEmbedder { embedder_token } => embedder_token,
};
let embedder_name = embedder.value();
let available_embedders =
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();

let embedding_config = embedding_configs
.iter()
.find(|config| config.name == embedder_name)
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;

let embedder_info = index
.embedding_configs()
.embedder_info(rtxn, embedder_name)?
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;

let arroy_wrapper = ArroyWrapper::new(
index.vector_arroy,
embedder_info.embedder_id,
embedding_config.config.quantized(),
);

let mut docids = match self {
VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => {
let fragment_name = fragment.value();
let fragment_config = embedding_config
.fragments
.as_slice()
.iter()
.find(|fragment| fragment.name == fragment_name)
.ok_or_else(|| FragmentDoesNotExist {
embedder,
fragment,
available: embedding_config
.fragments
.as_slice()
.iter()
.map(|f| f.name.clone())
.collect(),
})?;

arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
}
VectorFilterInner::DocumentTemplate { .. } => {
if !embedding_config.fragments.as_slice().is_empty() {
return Ok(RoaringBitmap::new());
}

let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
stats.documents - user_provided_docsids.clone()
}
VectorFilterInner::UserProvided { .. } => {
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
user_provided_docsids.clone()
}
VectorFilterInner::FullEmbedder { .. } => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
stats.documents
}
};

if regenerate {
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
docids -= skip_regenerate;
}

Ok(docids)
}
}
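The diff above introduces the whole `_vectors` filter grammar at once, so here is a small, self-contained sketch of how those paths decompose. It works on plain string segments rather than the crate's `Token` type, and every name in it (`VectorPath`, `classify`) is illustrative, not part of milli's API; treat it as a reading aid for the doc comment on `parse`, not as the implementation.

// Standalone sketch of the `_vectors` filter path grammar; names are illustrative only.
#[derive(Debug, PartialEq)]
enum VectorPath<'a> {
    AllEmbedders,
    FullEmbedder { embedder: &'a str },
    Fragment { embedder: &'a str, fragment: &'a str },
    UserProvided { embedder: &'a str },
    DocumentTemplate { embedder: &'a str },
}

fn classify(path: &str) -> Option<(VectorPath<'_>, bool)> {
    let mut parts = path.split('.');
    if parts.next()? != "_vectors" {
        return None; // would be `InvalidPrefix` in the real parser
    }
    let embedder = match parts.next() {
        None => return Some((VectorPath::AllEmbedders, false)),
        Some(e) => e,
    };
    let mut regenerate = false;
    let inner = match parts.next() {
        None => VectorPath::FullEmbedder { embedder },
        Some("fragments") => {
            let fragment = parts.next()?; // `MissingFragmentName` in the real parser
            VectorPath::Fragment { embedder, fragment }
        }
        Some("userProvided") => VectorPath::UserProvided { embedder },
        Some("documentTemplate") => VectorPath::DocumentTemplate { embedder },
        Some("regenerate") => {
            regenerate = true;
            VectorPath::FullEmbedder { embedder }
        }
        Some(_) => return None,
    };
    match parts.next() {
        None => {}
        Some("regenerate") if !regenerate => regenerate = true,
        Some(_) => return None, // `LeftoverToken` in the real parser
    }
    if parts.next().is_some() {
        return None;
    }
    Some((inner, regenerate))
}

fn main() {
    assert_eq!(classify("_vectors"), Some((VectorPath::AllEmbedders, false)));
    assert_eq!(
        classify("_vectors.default.fragments.title.regenerate"),
        Some((VectorPath::Fragment { embedder: "default", fragment: "title" }, true))
    );
    assert_eq!(
        classify("_vectors.default.userProvided"),
        Some((VectorPath::UserProvided { embedder: "default" }, false))
    );
    assert!(classify("not_vectors").is_none());
}

The real parser additionally reports `ExclusiveOptions` when two of `fragments`, `userProvided`, and `documentTemplate` are combined; the sketch simply rejects such paths.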
@ -17,6 +17,7 @@ mod facet_range_search;
mod facet_sort_ascending;
mod facet_sort_descending;
mod filter;
mod filter_vector;
mod search;

fn facet_extreme_value<'t>(
@ -19,7 +19,9 @@ use crate::update::{
};
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
use crate::vector::RuntimeEmbedders;
use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult};
use crate::{
db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult, UserError,
};

pub(crate) struct TempIndex {
pub inner: Index,
@ -1341,8 +1343,8 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() {
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
.unwrap_err();
assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_))));

index
.update_settings(|settings| {
@ -1373,6 +1375,6 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() {
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
.unwrap_err();
assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_))));
}
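For context on the test change above: both assertions now expect the search to fail with an `InvalidFilter` user error instead of succeeding with zero candidates. A compressed, stand-alone illustration of that assertion shape, with milli's error types stubbed out locally (the stub enums and the placeholder message are mine, not the crate's):

// Stub types standing in for milli's `Error` and `UserError`, so the snippet
// compiles on its own; only the shape of the new assertion is the point here.
#[derive(Debug)]
enum UserError {
    InvalidFilter(String),
}

#[derive(Debug)]
enum Error {
    UserError(UserError),
}

fn search_with_vector_filter() -> Result<(), Error> {
    // Placeholder result: the real message is produced by the vector-filter
    // error types added earlier in this diff.
    Err(Error::UserError(UserError::InvalidFilter("placeholder".into())))
}

fn main() {
    let err = search_with_vector_filter().unwrap_err();
    assert!(matches!(err, Error::UserError(UserError::InvalidFilter(_))));
}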
@ -93,7 +93,7 @@ pub struct ChatSearchParams {
pub hybrid: Setting<HybridQuery>,

#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[deserr(default = Setting::Set(20))]
#[schema(value_type = Option<usize>)]
pub limit: Setting<usize>,
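The only functional change in this hunk is the `deserr` default: a chat search request that omits `limit` now deserializes to `Setting::Set(20)` instead of an unset value. A minimal stand-in sketch of that behaviour (the local `Setting` enum and `limit_from_payload` are illustrative, not the crate's deserialization path):

// Sketch of the new default: a missing `limit` behaves like an explicit 20.
#[derive(Debug, PartialEq)]
enum Setting<T> {
    Set(T),
    NotSet,
}

fn limit_from_payload(limit: Option<usize>) -> Setting<usize> {
    match limit {
        Some(n) => Setting::Set(n),
        None => Setting::Set(20),
    }
}

fn main() {
    assert_eq!(limit_from_payload(None), Setting::Set(20));
    assert_ne!(limit_from_payload(None), Setting::<usize>::NotSet);
    assert_eq!(limit_from_payload(Some(5)), Setting::Set(5));
}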
@ -16,7 +16,6 @@ pub struct IndexerConfig {
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
pub experimental_no_edition_2024_for_settings: bool,
pub experimental_no_edition_2024_for_dumps: bool,
}

impl IndexerConfig {
@ -66,7 +65,6 @@ impl Default for IndexerConfig {
max_positions_per_attributes: None,
skip_index_budget: false,
experimental_no_edition_2024_for_settings: false,
experimental_no_edition_2024_for_dumps: false,
}
}
}
@ -128,6 +128,7 @@ impl EmbeddingStatus {
pub fn is_user_provided(&self, docid: DocumentId) -> bool {
self.user_provided.contains(docid)
}

/// Whether vectors should be regenerated for that document and that embedder.
pub fn must_regenerate(&self, docid: DocumentId) -> bool {
let invert = self.skip_regenerate_different_from_user_provided.contains(docid);
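The hunk above is cut off mid-function, but the field names suggest how the status is encoded: one bitmap marks user-provided vectors, and a second marks documents whose regenerate flag differs from the default implied by the first. A hedged, self-contained sketch of that encoding, assuming `roaring` for the bitmaps (this is an inference from the visible context, not the crate's actual `must_regenerate` body):

// Hedged sketch of the encoding suggested by the field names: generated
// vectors regenerate by default, user-provided ones do not, and the second
// bitmap records documents whose flag differs from that default.
use roaring::RoaringBitmap;

struct EmbeddingStatusSketch {
    user_provided: RoaringBitmap,
    skip_regenerate_different_from_user_provided: RoaringBitmap,
}

impl EmbeddingStatusSketch {
    fn must_regenerate(&self, docid: u32) -> bool {
        let user_provided = self.user_provided.contains(docid);
        let invert = self.skip_regenerate_different_from_user_provided.contains(docid);
        // Equivalent to !(user_provided ^ invert): flip the default when flagged.
        (!user_provided) ^ invert
    }
}

fn main() {
    let mut status = EmbeddingStatusSketch {
        user_provided: RoaringBitmap::new(),
        skip_regenerate_different_from_user_provided: RoaringBitmap::new(),
    };
    status.user_provided.insert(1);
    assert!(status.must_regenerate(0)); // generated vectors regenerate by default
    assert!(!status.must_regenerate(1)); // user-provided vectors do not
}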