mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-07-20 05:20:36 +00:00

Compare commits: latest...option-dis (11 commits)

| SHA1 |
|---|
| af2b722fed |
| 8cb7001755 |
| 882663bf7f |
| 3234f63c00 |
| 9fff081043 |
| 575b7b7a0b |
| 6287f5b204 |
| 5dac8e7168 |
| e669af1e49 |
| 0e0e29459c |
| c25f7e3450 |
.github/workflows/flaky-tests.yml (vendored): 2 lines changed

```diff
@@ -1,4 +1,6 @@
 name: Look for flaky tests
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 on:
   workflow_dispatch:
   schedule:
```
.github/workflows/fuzzer-indexing.yml (vendored): 3 lines changed

```diff
@@ -1,5 +1,6 @@
 name: Run the indexing fuzzer
-
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 on:
   push:
     branches:
```
.github/workflows/publish-apt-brew-pkg.yml (vendored): 2 lines changed

```diff
@@ -15,6 +15,8 @@ jobs:
 
   debian:
     name: Publish debian packagge
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     needs: check-version
     container:
```
.github/workflows/publish-binaries.yml (vendored): 4 lines changed

```diff
@@ -35,6 +35,8 @@ jobs:
   publish-linux:
     name: Publish binary for Linux
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     needs: check-version
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27
@@ -132,6 +134,8 @@ jobs:
     name: Publish binary for aarch64
     runs-on: ubuntu-latest
     needs: check-version
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27
       image: ubuntu:18.04
```
.github/workflows/test-suite.yml (vendored): 8 lines changed

```diff
@@ -21,6 +21,8 @@ jobs:
   test-linux:
     name: Tests on ubuntu-18.04
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
       image: ubuntu:18.04
@@ -77,6 +79,8 @@ jobs:
   test-all-features:
     name: Tests almost all features
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
       image: ubuntu:18.04
@@ -100,6 +104,8 @@ jobs:
 
   test-disabled-tokenization:
     name: Test disabled tokenization
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     container:
       image: ubuntu:18.04
@@ -127,6 +133,8 @@ jobs:
   # We run tests in debug also, to make sure that the debug_assertions are hit
   test-debug:
     name: Run tests in debug
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
```
Cargo.lock (generated): 42 lines changed

```diff
@@ -503,7 +503,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 
 [[package]]
 name = "benchmarks"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "bytes",
@@ -648,7 +648,7 @@ dependencies = [
 
 [[package]]
 name = "build-info"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "time",
@@ -1579,7 +1579,7 @@ dependencies = [
 
 [[package]]
 name = "dump"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "big_s",
@@ -1804,7 +1804,7 @@ dependencies = [
 
 [[package]]
 name = "file-store"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "faux",
  "tempfile",
@@ -1827,7 +1827,7 @@ dependencies = [
 
 [[package]]
 name = "filter-parser"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "insta",
  "nom",
@@ -1847,7 +1847,7 @@ dependencies = [
 
 [[package]]
 name = "flatten-serde-json"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "criterion",
  "serde_json",
@@ -1965,7 +1965,7 @@ dependencies = [
 
 [[package]]
 name = "fuzzers"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "arbitrary",
  "clap",
@@ -2452,7 +2452,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
 
 [[package]]
 name = "index-scheduler"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "arroy",
@@ -2649,7 +2649,7 @@ dependencies = [
 
 [[package]]
 name = "json-depth-checker"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "criterion",
  "serde_json",
@@ -3257,7 +3257,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "meili-snap"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "insta",
  "md5",
@@ -3266,7 +3266,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -3358,7 +3358,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-auth"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "base64 0.21.7",
  "enum-iterator",
@@ -3377,7 +3377,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-types"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -3407,7 +3407,7 @@ dependencies = [
 
 [[package]]
 name = "meilitool"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "clap",
@@ -3446,7 +3446,7 @@ dependencies = [
 
 [[package]]
 name = "milli"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "arroy",
  "big_s",
@@ -3886,7 +3886,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 
 [[package]]
 name = "permissive-json-pointer"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "big_s",
  "serde_json",
@@ -5098,9 +5098,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.34"
+version = "0.3.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
  "deranged",
  "itoa",
@@ -5121,9 +5121,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
 
 [[package]]
 name = "time-macros"
-version = "0.2.17"
+version = "0.2.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
+checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
 dependencies = [
  "num-conv",
  "time-core",
@@ -6042,7 +6042,7 @@ dependencies = [
 
 [[package]]
 name = "xtask"
-version = "1.9.0"
+version = "1.9.1"
 dependencies = [
  "anyhow",
  "build-info",
```
Cargo.toml (workspace manifest):

```diff
@@ -22,7 +22,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.9.0"
+version = "1.9.1"
 authors = [
   "Quentin de Quelen <quentin@dequelen.me>",
   "Clément Renault <clement@meilisearch.com>",
```
```diff
@@ -897,91 +897,95 @@ impl IndexScheduler {
                 dump_tasks.flush()?;
 
                 // 3. Dump the indexes
-                self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> {
-                    let rtxn = index.read_txn()?;
-                    let metadata = IndexMetadata {
-                        uid: uid.to_owned(),
-                        primary_key: index.primary_key(&rtxn)?.map(String::from),
-                        created_at: index.created_at(&rtxn)?,
-                        updated_at: index.updated_at(&rtxn)?,
-                    };
-                    let mut index_dumper = dump.create_index(uid, &metadata)?;
+                let () =
+                    self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> {
+                        let rtxn = index.read_txn()?;
+                        let metadata = IndexMetadata {
+                            uid: uid.to_owned(),
+                            primary_key: index.primary_key(&rtxn)?.map(String::from),
+                            created_at: index.created_at(&rtxn)?,
+                            updated_at: index.updated_at(&rtxn)?,
+                        };
+                        let mut index_dumper = dump.create_index(uid, &metadata)?;
 
-                    let fields_ids_map = index.fields_ids_map(&rtxn)?;
-                    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
-                    let embedding_configs = index.embedding_configs(&rtxn)?;
+                        let fields_ids_map = index.fields_ids_map(&rtxn)?;
+                        let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+                        let embedding_configs = index.embedding_configs(&rtxn)?;
 
-                    // 3.1. Dump the documents
-                    for ret in index.all_documents(&rtxn)? {
-                        if self.must_stop_processing.get() {
-                            return Err(Error::AbortedTask);
-                        }
+                        // 3.1. Dump the documents
+                        for ret in index.all_documents(&rtxn)? {
+                            if self.must_stop_processing.get() {
+                                return Err(Error::AbortedTask);
+                            }
 
-                        let (id, doc) = ret?;
+                            let (id, doc) = ret?;
 
-                        let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
+                            let mut document =
+                                milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
 
-                        'inject_vectors: {
-                            let embeddings = index.embeddings(&rtxn, id)?;
+                            'inject_vectors: {
+                                let embeddings = index.embeddings(&rtxn, id)?;
 
-                            if embeddings.is_empty() {
-                                break 'inject_vectors;
-                            }
+                                if embeddings.is_empty() {
+                                    break 'inject_vectors;
+                                }
 
-                            let vectors = document
-                                .entry(RESERVED_VECTORS_FIELD_NAME.to_owned())
-                                .or_insert(serde_json::Value::Object(Default::default()));
+                                let vectors = document
+                                    .entry(RESERVED_VECTORS_FIELD_NAME.to_owned())
+                                    .or_insert(serde_json::Value::Object(Default::default()));
 
-                            let serde_json::Value::Object(vectors) = vectors else {
-                                return Err(milli::Error::UserError(
-                                    milli::UserError::InvalidVectorsMapType {
-                                        document_id: {
-                                            if let Ok(Some(Ok(index))) = index
-                                                .external_id_of(&rtxn, std::iter::once(id))
-                                                .map(|it| it.into_iter().next())
-                                            {
-                                                index
-                                            } else {
-                                                format!("internal docid={id}")
-                                            }
-                                        },
-                                        value: vectors.clone(),
-                                    },
-                                )
-                                .into());
-                            };
+                                let serde_json::Value::Object(vectors) = vectors else {
+                                    return Err(milli::Error::UserError(
+                                        milli::UserError::InvalidVectorsMapType {
+                                            document_id: {
+                                                if let Ok(Some(Ok(index))) = index
+                                                    .external_id_of(&rtxn, std::iter::once(id))
+                                                    .map(|it| it.into_iter().next())
+                                                {
+                                                    index
+                                                } else {
+                                                    format!("internal docid={id}")
+                                                }
+                                            },
+                                            value: vectors.clone(),
+                                        },
+                                    )
+                                    .into());
+                                };
 
-                            for (embedder_name, embeddings) in embeddings {
-                                let user_provided = embedding_configs
-                                    .iter()
-                                    .find(|conf| conf.name == embedder_name)
-                                    .is_some_and(|conf| conf.user_provided.contains(id));
+                                for (embedder_name, embeddings) in embeddings {
+                                    let user_provided = embedding_configs
+                                        .iter()
+                                        .find(|conf| conf.name == embedder_name)
+                                        .is_some_and(|conf| conf.user_provided.contains(id));
 
-                                let embeddings = ExplicitVectors {
-                                    embeddings: Some(
-                                        VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
-                                    ),
-                                    regenerate: !user_provided,
-                                };
-                                vectors.insert(
-                                    embedder_name,
-                                    serde_json::to_value(embeddings).unwrap(),
-                                );
+                                    let embeddings = ExplicitVectors {
+                                        embeddings: Some(
+                                            VectorOrArrayOfVectors::from_array_of_vectors(
+                                                embeddings,
+                                            ),
+                                        ),
+                                        regenerate: !user_provided,
+                                    };
+                                    vectors.insert(
+                                        embedder_name,
+                                        serde_json::to_value(embeddings).unwrap(),
+                                    );
+                                }
                             }
-                        }
 
-                        index_dumper.push_document(&document)?;
-                    }
+                            index_dumper.push_document(&document)?;
+                        }
 
-                    // 3.2. Dump the settings
-                    let settings = meilisearch_types::settings::settings(
-                        index,
-                        &rtxn,
-                        meilisearch_types::settings::SecretPolicy::RevealSecrets,
-                    )?;
-                    index_dumper.settings(&settings)?;
-                    Ok(())
-                })?;
+                        // 3.2. Dump the settings
+                        let settings = meilisearch_types::settings::settings(
+                            index,
+                            &rtxn,
+                            meilisearch_types::settings::SecretPolicy::RevealSecrets,
+                        )?;
+                        index_dumper.settings(&settings)?;
+                        Ok(())
+                    })?;
 
                 // 4. Dump experimental feature settings
                 let features = self.features().runtime_features();
```
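Functionally this hunk is a no-op: the `try_for_each_index` call is now bound as `let () = ...` and the closure body re-indented; the vector-injection logic is unchanged. For readers unfamiliar with the labeled-block pattern behind `'inject_vectors`, here is a minimal, self-contained sketch using plain `serde_json`; `inject_vectors` and its inputs are hypothetical stand-ins for the index lookups and error handling in the real code:

```rust
use serde_json::{json, Map, Value};

// Hypothetical stand-in for the dump loop's vector injection; the real code
// reads embeddings from the index and raises a user error instead of
// returning silently.
fn inject_vectors(document: &mut Map<String, Value>, embeddings: Vec<(String, Vec<Vec<f32>>)>) {
    'inject_vectors: {
        // Nothing to inject: leave the document untouched.
        if embeddings.is_empty() {
            break 'inject_vectors;
        }

        // Get or create the reserved `_vectors` object on the document.
        let vectors = document
            .entry("_vectors".to_owned())
            .or_insert(Value::Object(Default::default()));
        let Value::Object(vectors) = vectors else {
            return; // the real code returns `InvalidVectorsMapType` here
        };

        for (embedder_name, embeddings) in embeddings {
            vectors.insert(
                embedder_name,
                json!({ "embeddings": embeddings, "regenerate": false }),
            );
        }
    }
}

fn main() {
    let mut doc = json!({ "id": 1, "name": "echo" });
    if let Value::Object(map) = &mut doc {
        inject_vectors(map, vec![("manual".to_owned(), vec![vec![0.0, 1.0]])]);
    }
    println!("{doc}");
}
```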
```diff
@@ -1288,7 +1292,11 @@ impl IndexScheduler {
                 }
             }
 
-            let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
+            let config = IndexDocumentsConfig {
+                update_method: method,
+                compute_prefix_databases: self.compute_prefix_databases,
+                ..Default::default()
+            };
 
             let embedder_configs = index.embedding_configs(index_wtxn)?;
             // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
```
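The one-liner grows into a multi-line literal because a second non-default field is now set explicitly; the remaining fields still come from `..Default::default()`. A sketch of this struct-update pattern with simplified stand-in types (the real ones live in milli; see the `IndexDocumentsConfig` hunks further down):

```rust
// Stand-in types; field names mirror the diff, everything else is reduced.
#[derive(Debug, Clone, Copy, Default)]
enum IndexDocumentsMethod {
    #[default]
    ReplaceDocuments,
    UpdateDocuments,
}

#[derive(Debug)]
struct IndexDocumentsConfig {
    update_method: IndexDocumentsMethod,
    autogenerate_docids: bool,
    compute_prefix_databases: bool,
}

impl Default for IndexDocumentsConfig {
    fn default() -> Self {
        // Mirrors the manual `Default` impl added in milli: prefix
        // databases stay on unless a caller explicitly opts out.
        Self {
            update_method: Default::default(),
            autogenerate_docids: false,
            compute_prefix_databases: true,
        }
    }
}

fn main() {
    let method = IndexDocumentsMethod::UpdateDocuments;
    let compute_prefix_databases = false; // threaded in from the scheduler
    let config = IndexDocumentsConfig {
        update_method: method,
        compute_prefix_databases,
        ..Default::default() // the base expression must come last
    };
    println!("{config:?}");
}
```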
```diff
@@ -1398,6 +1406,7 @@ impl IndexScheduler {
                 let deleted_documents = delete_document_by_filter(
                     index_wtxn,
                     filter,
+                    self.compute_prefix_databases,
                     self.index_mapper.indexer_config(),
                     self.must_stop_processing.clone(),
                     index,
```
```diff
@@ -1638,6 +1647,7 @@ impl IndexScheduler {
 fn delete_document_by_filter<'a>(
     wtxn: &mut RwTxn<'a>,
     filter: &serde_json::Value,
+    compute_prefix_databases: bool,
     indexer_config: &IndexerConfig,
     must_stop_processing: MustStopProcessing,
     index: &'a Index,
```
```diff
@@ -1653,6 +1663,7 @@ fn delete_document_by_filter<'a>(
 
         let config = IndexDocumentsConfig {
             update_method: IndexDocumentsMethod::ReplaceDocuments,
+            compute_prefix_databases,
             ..Default::default()
         };
 
```
```diff
@@ -32,6 +32,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
         features: _,
         max_number_of_tasks: _,
         max_number_of_batched_tasks: _,
+        compute_prefix_databases: _,
         wake_up: _,
         dumps_path: _,
         snapshots_path: _,
```
```diff
@@ -276,6 +276,8 @@ pub struct IndexSchedulerOptions {
     pub max_number_of_batched_tasks: usize,
     /// The experimental features enabled for this instance.
     pub instance_features: InstanceTogglableFeatures,
+    /// An experimental option to control the generation of prefix databases.
+    pub compute_prefix_databases: bool,
 }
 
 /// Structure which holds meilisearch's indexes and schedules the tasks
```
|
|||||||
pub struct IndexScheduler {
|
pub struct IndexScheduler {
|
||||||
/// The LMDB environment which the DBs are associated with.
|
/// The LMDB environment which the DBs are associated with.
|
||||||
pub(crate) env: Env,
|
pub(crate) env: Env,
|
||||||
|
|
||||||
/// A boolean that can be set to true to stop the currently processing tasks.
|
/// A boolean that can be set to true to stop the currently processing tasks.
|
||||||
pub(crate) must_stop_processing: MustStopProcessing,
|
pub(crate) must_stop_processing: MustStopProcessing,
|
||||||
|
|
||||||
/// The list of tasks currently processing
|
/// The list of tasks currently processing
|
||||||
pub(crate) processing_tasks: Arc<RwLock<ProcessingTasks>>,
|
pub(crate) processing_tasks: Arc<RwLock<ProcessingTasks>>,
|
||||||
|
|
||||||
/// The list of files referenced by the tasks
|
/// The list of files referenced by the tasks
|
||||||
pub(crate) file_store: FileStore,
|
pub(crate) file_store: FileStore, // The main database, it contains all the tasks accessible by their Id.
|
||||||
|
|
||||||
// The main database, it contains all the tasks accessible by their Id.
|
|
||||||
pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,
|
pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,
|
||||||
|
|
||||||
/// All the tasks ids grouped by their status.
|
/// All the tasks ids grouped by their status.
|
||||||
// TODO we should not be able to serialize a `Status::Processing` in this database.
|
// TODO we should not be able to serialize a `Status::Processing` in this database.
|
||||||
pub(crate) status: Database<SerdeBincode<Status>, RoaringBitmapCodec>,
|
pub(crate) status: Database<SerdeBincode<Status>, RoaringBitmapCodec>,
|
||||||
```diff
@@ -303,58 +299,43 @@ pub struct IndexScheduler {
     pub(crate) kind: Database<SerdeBincode<Kind>, RoaringBitmapCodec>,
     /// Store the tasks associated to an index.
     pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
-
     /// Store the tasks that were canceled by a task uid
     pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
-
     /// Store the task ids of tasks which were enqueued at a specific date
     pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
-
     /// Store the task ids of finished tasks which started being processed at a specific date
     pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
-
     /// Store the task ids of tasks which finished at a specific date
     pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
-
     /// In charge of creating, opening, storing and returning indexes.
     pub(crate) index_mapper: IndexMapper,
-
     /// In charge of fetching and setting the status of experimental features.
     features: features::FeatureData,
-
     /// Get a signal when a batch needs to be processed.
     pub(crate) wake_up: Arc<SignalEvent>,
-
     /// Whether auto-batching is enabled or not.
     pub(crate) autobatching_enabled: bool,
-
     /// Whether we should automatically cleanup the task queue or not.
     pub(crate) cleanup_enabled: bool,
-
     /// The max number of tasks allowed before the scheduler starts to delete
     /// the finished tasks automatically.
     pub(crate) max_number_of_tasks: usize,
-
     /// The maximum number of tasks that will be batched together.
     pub(crate) max_number_of_batched_tasks: usize,
-
+    /// Control wether we must generate the prefix databases or not.
+    pub(crate) compute_prefix_databases: bool,
     /// The webhook url we should send tasks to after processing every batches.
     pub(crate) webhook_url: Option<String>,
     /// The Authorization header to send to the webhook URL.
     pub(crate) webhook_authorization_header: Option<String>,
-
     /// The path used to create the dumps.
     pub(crate) dumps_path: PathBuf,
-
     /// The path used to create the snapshots.
     pub(crate) snapshots_path: PathBuf,
-
     /// The path to the folder containing the auth LMDB env.
     pub(crate) auth_path: PathBuf,
-
     /// The path to the version file of Meilisearch.
     pub(crate) version_file_path: PathBuf,
-
     embedders: Arc<RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>>,
 
     // ================= test
```
```diff
@@ -364,13 +345,11 @@ pub struct IndexScheduler {
     /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation.
     #[cfg(test)]
     test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>,
-
     /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler.
     ///
     /// The first field is the iteration index and the second field identifies a location in the code.
     #[cfg(test)]
     planned_failures: Vec<(usize, tests::FailureLocation)>,
-
     /// A counter that is incremented before every call to [`tick`](IndexScheduler::tick)
     #[cfg(test)]
     run_loop_iteration: Arc<RwLock<usize>>,
```
```diff
@@ -397,6 +376,7 @@ impl IndexScheduler {
             cleanup_enabled: self.cleanup_enabled,
             max_number_of_tasks: self.max_number_of_tasks,
             max_number_of_batched_tasks: self.max_number_of_batched_tasks,
+            compute_prefix_databases: self.compute_prefix_databases,
             snapshots_path: self.snapshots_path.clone(),
             dumps_path: self.dumps_path.clone(),
             auth_path: self.auth_path.clone(),
```
```diff
@@ -499,6 +479,7 @@ impl IndexScheduler {
             cleanup_enabled: options.cleanup_enabled,
             max_number_of_tasks: options.max_number_of_tasks,
             max_number_of_batched_tasks: options.max_number_of_batched_tasks,
+            compute_prefix_databases: options.compute_prefix_databases,
             dumps_path: options.dumps_path,
             snapshots_path: options.snapshots_path,
             auth_path: options.auth_path,
```
```diff
@@ -1819,6 +1800,7 @@ mod tests {
             max_number_of_tasks: 1_000_000,
             max_number_of_batched_tasks: usize::MAX,
             instance_features: Default::default(),
+            compute_prefix_databases: true,
         };
         configuration(&mut options);
 
```
```diff
@@ -256,6 +256,7 @@ struct Infos {
     experimental_enable_logs_route: bool,
     experimental_reduce_indexing_memory_usage: bool,
     experimental_max_number_of_batched_tasks: usize,
+    experimental_disable_prefix_db: bool,
     gpu_enabled: bool,
     db_path: bool,
     import_dump: bool,
```
```diff
@@ -298,6 +299,7 @@ impl From<Opt> for Infos {
             experimental_enable_logs_route,
             experimental_reduce_indexing_memory_usage,
             experimental_max_number_of_batched_tasks,
+            experimental_disable_prefix_db,
             http_addr,
             master_key: _,
             env,
```
```diff
@@ -347,6 +349,7 @@ impl From<Opt> for Infos {
             experimental_replication_parameters,
             experimental_enable_logs_route,
             experimental_reduce_indexing_memory_usage,
+            experimental_disable_prefix_db,
             gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
             db_path: db_path != PathBuf::from("./data.ms"),
             import_dump: import_dump.is_some(),
```
```diff
@@ -311,6 +311,7 @@ fn open_or_create_database_unchecked(
         index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
         index_count: DEFAULT_INDEX_COUNT,
         instance_features,
+        compute_prefix_databases: !opt.experimental_disable_prefix_db,
     })?)
 };
 
```
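Note the polarity flip: the user-facing option is a disable switch, while the scheduler and milli carry the positive `compute_prefix_databases` flag, so the wiring is a single negation. A trivial sketch of the mapping:

```rust
fn main() {
    // `experimental_disable_prefix_db` is what the CLI/env/config expose;
    // `compute_prefix_databases` is what the index scheduler consumes.
    let experimental_disable_prefix_db = false; // default: feature stays on
    let compute_prefix_databases = !experimental_disable_prefix_db;
    assert!(compute_prefix_databases);
}
```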
```diff
@@ -60,6 +60,7 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
     "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
 const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
     "MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS";
+const MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB: &str = "MEILI_EXPERIMENTAL_DISABLE_PREFIXDB";
 
 const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
 const DEFAULT_DB_PATH: &str = "./data.ms";
```
```diff
@@ -389,6 +390,11 @@ pub struct Opt {
     #[serde(default = "default_limit_batched_tasks")]
     pub experimental_max_number_of_batched_tasks: usize,
 
+    /// Experimentally disable the prefix database, see: <https://github.com/orgs/meilisearch/discussions>
+    #[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB)]
+    #[serde(default)]
+    pub experimental_disable_prefix_db: bool,
+
     #[serde(flatten)]
     #[clap(flatten)]
     pub indexer_options: IndexerOpts,
```
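The new flag follows the same recipe as the other experimental options: a clap argument backed by an environment variable, with a serde default so a config file may omit it. A stripped-down sketch of that recipe (assumes clap 4 with the `derive` and `env` features plus serde; this is not the real `Opt`):

```rust
use clap::Parser;
use serde::Deserialize;

const MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB: &str = "MEILI_EXPERIMENTAL_DISABLE_PREFIXDB";

#[derive(Debug, Parser, Deserialize)]
struct Opt {
    /// Experimentally disable the prefix database.
    // `long` exposes `--experimental-disable-prefix-db`, `env` lets the
    // variable above toggle it, and `#[serde(default)]` keeps the field
    // optional when the struct is deserialized from a config file.
    #[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB)]
    #[serde(default)]
    experimental_disable_prefix_db: bool,
}

fn main() {
    let opt = Opt::parse();
    println!("disable prefix db: {}", opt.experimental_disable_prefix_db);
}
```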
```diff
@@ -489,6 +495,7 @@ impl Opt {
             experimental_enable_logs_route,
             experimental_replication_parameters,
             experimental_reduce_indexing_memory_usage,
+            experimental_disable_prefix_db,
         } = self;
         export_to_env_if_not_present(MEILI_DB_PATH, db_path);
         export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
```
```diff
@@ -518,6 +525,10 @@ impl Opt {
             MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
             experimental_max_number_of_batched_tasks.to_string(),
         );
+        export_to_env_if_not_present(
+            MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB,
+            experimental_disable_prefix_db.to_string(),
+        );
         if let Some(ssl_cert_path) = ssl_cert_path {
             export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path);
         }
```
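`export_to_env_if_not_present` forwards parsed options back into the process environment. A sketch of its assumed behavior (set only when the variable is absent, so anything the user exported explicitly wins):

```rust
use std::env;

// Assumed behavior of Meilisearch's helper: an already-present variable is
// never overwritten by a value coming from the CLI or the config file.
fn export_to_env_if_not_present(key: &str, value: String) {
    if env::var_os(key).is_none() {
        env::set_var(key, value);
    }
}

fn main() {
    let experimental_disable_prefix_db = true; // would come from the parsed Opt
    export_to_env_if_not_present(
        "MEILI_EXPERIMENTAL_DISABLE_PREFIXDB",
        experimental_disable_prefix_db.to_string(),
    );
    assert_eq!(
        env::var("MEILI_EXPERIMENTAL_DISABLE_PREFIXDB").as_deref(),
        Ok("true")
    );
}
```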
```diff
@@ -644,7 +644,12 @@ async fn get_document_with_vectors() {
         {
           "id": 1,
           "name": "echo",
-          "_vectors": {}
+          "_vectors": {
+            "manual": {
+              "embeddings": [],
+              "regenerate": false
+            }
+          }
         }
       ],
       "offset": 0,
```
```diff
@@ -700,7 +705,12 @@ async fn get_document_with_vectors() {
         },
         {
           "name": "echo",
-          "_vectors": {}
+          "_vectors": {
+            "manual": {
+              "embeddings": [],
+              "regenerate": false
+            }
+          }
         }
       ],
       "offset": 0,
```
```diff
@@ -119,7 +119,12 @@ async fn add_remove_user_provided() {
         {
           "id": 1,
           "name": "echo",
-          "_vectors": {}
+          "_vectors": {
+            "manual": {
+              "embeddings": [],
+              "regenerate": false
+            }
+          }
         }
       ],
       "offset": 0,
```
```diff
@@ -141,7 +146,12 @@ async fn add_remove_user_provided() {
         {
           "id": 1,
           "name": "echo",
-          "_vectors": {}
+          "_vectors": {
+            "manual": {
+              "embeddings": [],
+              "regenerate": false
+            }
+          }
         }
       ],
       "offset": 0,
```
```diff
@@ -577,7 +587,12 @@ async fn add_remove_one_vector_4588() {
         {
           "id": 0,
           "name": "kefir",
-          "_vectors": {}
+          "_vectors": {
+            "manual": {
+              "embeddings": [],
+              "regenerate": false
+            }
+          }
         }
       ],
       "offset": 0,
```
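All five updated snapshots share one shape per embedder. A reduced serde type that round-trips it (the real type is milli's `ExplicitVectors`, whose `embeddings` field is the more permissive `VectorOrArrayOfVectors`):

```rust
use serde::{Deserialize, Serialize};

// Reduced stand-in for milli's `ExplicitVectors`.
#[derive(Debug, Serialize, Deserialize)]
struct ExplicitVectors {
    embeddings: Option<Vec<Vec<f32>>>,
    regenerate: bool,
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{ "embeddings": [], "regenerate": false }"#;
    let vectors: ExplicitVectors = serde_json::from_str(raw)?;
    // An embedder with no vectors now shows up as an empty list rather
    // than as a missing `_vectors` entry.
    assert_eq!(vectors.embeddings, Some(vec![]));
    assert!(!vectors.regenerate);
    Ok(())
}
```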
@ -141,3 +141,6 @@ swedish-recomposition = ["charabia/swedish-recomposition"]
|
|||||||
|
|
||||||
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
||||||
cuda = ["candle-core/cuda"]
|
cuda = ["candle-core/cuda"]
|
||||||
|
|
||||||
|
[lints.rust]
|
||||||
|
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }
|
||||||
|
```diff
@@ -1230,6 +1230,11 @@ impl Index {
         )
     }
 
+    /// Deletes the FST which is the words prefixes dictionary of the engine.
+    pub fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY)
+    }
+
     /// Returns the FST which is the words prefixes dictionary of the engine.
     pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
         match self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? {
```
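The new `delete_words_prefixes_fst` helper is a single key removal in the index's `main` database. A sketch of the underlying heed call it leans on, assuming heed 0.20 and the `tempfile` crate; the key name and setup are illustrative only:

```rust
use heed::types::{Bytes, Str};
use heed::{Database, EnvOpenOptions};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dir = tempfile::tempdir()?;
    // Opening an LMDB environment is unsafe in heed 0.20 because the same
    // path must not be opened twice in one process.
    let env = unsafe { EnvOpenOptions::new().open(dir.path())? };
    let mut wtxn = env.write_txn()?;
    let main: Database<Str, Bytes> = env.create_database(&mut wtxn, None)?;

    main.put(&mut wtxn, "words-prefixes-fst", b"fst bytes")?;
    // `delete` removes the key and reports whether it was present,
    // which is exactly the `heed::Result<bool>` the helper returns.
    let existed = main.delete(&mut wtxn, "words-prefixes-fst")?;
    assert!(existed);
    wtxn.commit()?;
    Ok(())
}
```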
```diff
@@ -1660,9 +1665,7 @@ impl Index {
                 }
             }
 
-            if !embeddings.is_empty() {
-                res.insert(embedder_name.to_owned(), embeddings);
-            }
+            res.insert(embedder_name.to_owned(), embeddings);
         }
         Ok(res)
     }
```
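Dropping the `is_empty` guard changes what the embeddings accessor reports: an embedder that produced no vectors for a document now appears in the result map with an empty list instead of being omitted, which is what the updated `_vectors` snapshots earlier rely on. A behavioral sketch:

```rust
use std::collections::BTreeMap;

fn main() {
    // One embedder, zero embeddings for this document.
    let per_embedder: Vec<(&str, Vec<Vec<f32>>)> = vec![("manual", vec![])];

    let mut res = BTreeMap::new();
    for (embedder_name, embeddings) in per_embedder {
        // Before this change the insert was wrapped in
        // `if !embeddings.is_empty() { ... }`, hiding empty embedders.
        res.insert(embedder_name.to_owned(), embeddings);
    }

    assert_eq!(res.get("manual").map(Vec::len), Some(0));
}
```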
```diff
@@ -85,7 +85,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
     embedders: EmbeddingConfigs,
 }
 
-#[derive(Default, Debug, Clone)]
+#[derive(Debug, Clone)]
 pub struct IndexDocumentsConfig {
     pub words_prefix_threshold: Option<u32>,
     pub max_prefix_length: Option<usize>,
```
```diff
@@ -93,6 +93,21 @@ pub struct IndexDocumentsConfig {
     pub words_positions_min_level_size: Option<NonZeroU32>,
     pub update_method: IndexDocumentsMethod,
     pub autogenerate_docids: bool,
+    pub compute_prefix_databases: bool,
+}
+
+impl Default for IndexDocumentsConfig {
+    fn default() -> Self {
+        Self {
+            words_prefix_threshold: Default::default(),
+            max_prefix_length: Default::default(),
+            words_positions_level_group_size: Default::default(),
+            words_positions_min_level_size: Default::default(),
+            update_method: Default::default(),
+            autogenerate_docids: Default::default(),
+            compute_prefix_databases: true,
+        }
+    }
 }
 
 impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
```
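The `Default` derive is replaced by a manual impl because deriving would zero-initialize the new flag, i.e. `compute_prefix_databases: false`, silently turning prefix databases off for every caller that uses `..Default::default()`. A reduced demonstration of the difference:

```rust
// With the derive, a bool field defaults to `false`.
#[derive(Default)]
struct Derived {
    compute_prefix_databases: bool,
}

// The hand-written impl can pick the backward-compatible default.
struct Manual {
    compute_prefix_databases: bool,
}

impl Default for Manual {
    fn default() -> Self {
        Self { compute_prefix_databases: true }
    }
}

fn main() {
    assert!(!Derived::default().compute_prefix_databases); // derived: off
    assert!(Manual::default().compute_prefix_databases); // manual: on
}
```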
```diff
@@ -558,12 +573,20 @@ where
             .map_err(InternalError::from)??;
         }
 
-        self.execute_prefix_databases(
-            word_docids.map(MergerBuilder::build),
-            exact_word_docids.map(MergerBuilder::build),
-            word_position_docids.map(MergerBuilder::build),
-            word_fid_docids.map(MergerBuilder::build),
-        )?;
+        if self.config.compute_prefix_databases {
+            self.execute_prefix_databases(
+                word_docids.map(MergerBuilder::build),
+                exact_word_docids.map(MergerBuilder::build),
+                word_position_docids.map(MergerBuilder::build),
+                word_fid_docids.map(MergerBuilder::build),
+            )?;
+        } else {
+            self.index.words_prefixes_fst(self.wtxn)?;
+            self.index.word_prefix_docids.clear(self.wtxn)?;
+            self.index.exact_word_prefix_docids.clear(self.wtxn)?;
+            self.index.word_prefix_position_docids.clear(self.wtxn)?;
+            self.index.word_prefix_fid_docids.clear(self.wtxn)?;
+        }
 
         Ok(number_of_documents)
     }
```
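This gate is the heart of the option: when `compute_prefix_databases` is off, the builder not only skips `execute_prefix_databases` but clears the prefix databases, so stale prefix entries from an earlier run cannot surface in search results. A reduced sketch of that control flow with stand-in methods (the real ones operate on LMDB databases):

```rust
struct Builder {
    compute_prefix_databases: bool,
}

impl Builder {
    fn execute_prefix_databases(&mut self) -> Result<(), String> {
        println!("building word prefix databases");
        Ok(())
    }

    fn clear_prefix_databases(&mut self) -> Result<(), String> {
        // Mirrors the `else` arm above: previously built prefix data is
        // actively removed, not merely left un-updated.
        println!("clearing word prefix databases");
        Ok(())
    }

    fn finish(&mut self, number_of_documents: u64) -> Result<u64, String> {
        if self.compute_prefix_databases {
            self.execute_prefix_databases()?;
        } else {
            self.clear_prefix_databases()?;
        }
        Ok(number_of_documents)
    }
}

fn main() -> Result<(), String> {
    let mut builder = Builder { compute_prefix_databases: false };
    let n = builder.finish(42)?;
    println!("indexed {n} documents");
    Ok(())
}
```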
```diff
@@ -2180,33 +2203,6 @@ mod tests {
         index.add_documents(doc1).unwrap();
     }
 
-    #[cfg(feature = "default")]
-    #[test]
-    fn store_detected_script_and_language_per_document_during_indexing() {
-        use charabia::{Language, Script};
-        let index = TempIndex::new();
-        index
-            .add_documents(documents!([
-                { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
-                { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
-                { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
-                { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
-                { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
-                { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
-            ]))
-            .unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let key_jpn = (Script::Cj, Language::Jpn);
-        let key_cmn = (Script::Cj, Language::Cmn);
-        let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
-        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
-        let expected_cj_jpn_docids = [3].iter().collect();
-        assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
-        let expected_cj_cmn_docids = [1, 5].iter().collect();
-        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
-    }
-
     #[test]
     fn add_and_delete_documents_in_single_transform() {
         let mut index = TempIndex::new();
```