Compare commits

...

31 Commits

Author SHA1 Message Date
Clément Renault
63845ad211 Merge pull request #6024 from meilisearch/fix-metrics-route
Limit the number of retrieved task to one in the metrics
2025-12-03 14:18:49 +01:00
Kerollmops
4fad5e5d42 Limit the number of retrieved task to one 2025-12-03 14:18:09 +01:00
Kerollmops
726eae5e97 Fix non-detected searchable attribute 2025-12-03 14:18:09 +01:00
Kerollmops
2d85baa960 Introduce a test for when a new nested field becomes searchable 2025-12-03 14:18:09 +01:00
Kerollmops
fe2577f0dc Clean up the CI 2025-12-03 14:18:09 +01:00
Clément Renault
3c0313626d Fix a bug and improve code quality
Co-authored-by: Many the fish <many@meilisearch.com>
2025-12-03 14:18:09 +01:00
Kerollmops
8cd1c82ebf Update the proximity precision for the settings delta 2025-12-03 14:18:09 +01:00
Clément Renault
fdef327abb Show available space 2025-12-03 14:18:09 +01:00
Clément Renault
979cae3221 Trigger the new settings indexer when changing the proximity precision 2025-12-03 14:18:09 +01:00
Clément Renault
da802ab9e4 Clear old word prefix fid docids entries when removing searchable fields 2025-12-03 14:18:09 +01:00
Clément Renault
e754d1b514 Introduce the word pair proximity extractor 2025-12-03 14:18:08 +01:00
Clément Renault
694df94ead Move the has_searchable_children function to the appropriate module 2025-12-03 14:18:08 +01:00
Clément Renault
f0ba223c26 Rename the function to extract document words when a setting changes
Co-authored-By: Maxime Legendre <maxime@meilisearch.com>
2025-12-03 14:18:08 +01:00
Clément Renault
31c6e20ab2 Merge the logic of the function detecting searchable children fields 2025-12-03 14:18:08 +01:00
Clément Renault
67fa23d710 Fix a bug when nested fields appear
Co-authored-by: Many the fish <many@meilisearch.com>
2025-12-03 14:18:08 +01:00
Clément Renault
5dd45c4953 Add some comments
Co-authored-by: Many the fish <many@meilisearch.com>
2025-12-03 14:18:08 +01:00
Clément Renault
2772c06320 Fix a test trying to change settings with a wtxn 2025-12-03 14:18:08 +01:00
Clément Renault
8e6eea3cc3 Make sure the embedders supports changing searchables 2025-12-03 14:18:08 +01:00
Clément Renault
c8ed1158c4 Make sure we don't crash on unreferenced fields 2025-12-03 14:18:07 +01:00
Clément Renault
5d94d82d1f Make clippy happy 2025-12-03 14:18:07 +01:00
Clément Renault
2148eca563 Introduce new progress steps when deleting fid-based entries 2025-12-03 14:18:07 +01:00
Clément Renault
5f77e27853 Delete entries from fid-based databases when searchables are deleted 2025-12-03 14:18:07 +01:00
Clément Renault
4effc02b2d Support exact attributes in the settings delta 2025-12-03 14:18:07 +01:00
Clément Renault
f2918d421f Call the post processing in the new settings indexer 2025-12-03 14:18:07 +01:00
Clément Renault
f5d53aabfd Support exact attributes in the field metadata 2025-12-03 14:18:07 +01:00
Clément Renault
b1081d6148 Call the new searchable extractor 2025-12-03 14:18:07 +01:00
Clément Renault
d09566f751 Introduce the new searchable extractor 2025-12-03 14:18:06 +01:00
Clément Renault
1b1c396656 Enable the new settings indexer when the searchable or exact are updates 2025-12-03 14:18:06 +01:00
Kerollmops
2da48cdd34 Skip the macOS and Windows CI in the merge queue 2025-12-03 14:18:06 +01:00
Kerollmops
da733135c8 Update the snapshots 2025-12-03 14:17:11 +01:00
Kerollmops
c3f14b1f00 Bump the version to v1.28.2 2025-12-03 14:10:23 +01:00
32 changed files with 1236 additions and 161 deletions

View File

@@ -53,6 +53,7 @@ jobs:
     matrix:
       os: [macos-14, windows-2022]
       features: ["", "--features enterprise"]
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
     steps:
       - uses: actions/checkout@v5
       - name: Cache dependencies
@@ -158,8 +159,6 @@ jobs:
     steps:
       - uses: actions/checkout@v5
       - uses: dtolnay/rust-toolchain@1.89
-      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.8.0
       - name: Run tests in debug
         uses: actions-rs/cargo@v1
         with:

Cargo.lock (generated)
View File

@@ -580,7 +580,7 @@ source = "git+https://github.com/meilisearch/bbqueue#e8af4a4bccc8eb36b2b0442c4a9
 [[package]]
 name = "benchmarks"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "bumpalo",
@@ -790,7 +790,7 @@ dependencies = [
 [[package]]
 name = "build-info"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "time",
@@ -1786,7 +1786,7 @@ dependencies = [
 [[package]]
 name = "dump"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "big_s",
@@ -2018,7 +2018,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
 [[package]]
 name = "file-store"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "tempfile",
  "thiserror 2.0.17",
@@ -2040,7 +2040,7 @@ dependencies = [
 [[package]]
 name = "filter-parser"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "insta",
  "levenshtein_automata",
@@ -2068,7 +2068,7 @@ dependencies = [
 [[package]]
 name = "flatten-serde-json"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "criterion",
  "serde_json",
@@ -2231,7 +2231,7 @@ dependencies = [
 [[package]]
 name = "fuzzers"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "arbitrary",
  "bumpalo",
@@ -3198,7 +3198,7 @@ dependencies = [
 [[package]]
 name = "index-scheduler"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "backoff",
@@ -3460,7 +3460,7 @@ dependencies = [
 [[package]]
 name = "json-depth-checker"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "criterion",
  "serde_json",
@@ -3974,7 +3974,7 @@ checksum = "ae960838283323069879657ca3de837e9f7bbb4c7bf6ea7f1b290d5e9476d2e0"
 [[package]]
 name = "meili-snap"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "insta",
  "md5 0.8.0",
@@ -3985,7 +3985,7 @@ dependencies = [
 [[package]]
 name = "meilisearch"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -4083,7 +4083,7 @@ dependencies = [
 [[package]]
 name = "meilisearch-auth"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "base64 0.22.1",
  "enum-iterator",
@@ -4102,7 +4102,7 @@ dependencies = [
 [[package]]
 name = "meilisearch-types"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -4137,7 +4137,7 @@ dependencies = [
 [[package]]
 name = "meilitool"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "clap",
@@ -4171,7 +4171,7 @@ dependencies = [
 [[package]]
 name = "milli"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "arroy",
  "bbqueue",
@@ -4750,7 +4750,7 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
 [[package]]
 name = "permissive-json-pointer"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "big_s",
  "serde_json",
@@ -7783,7 +7783,7 @@ dependencies = [
 [[package]]
 name = "xtask"
-version = "1.28.1"
+version = "1.28.2"
 dependencies = [
  "anyhow",
  "build-info",

View File

@@ -23,7 +23,7 @@ members = [
 ]

 [workspace.package]
-version = "1.28.1"
+version = "1.28.2"
 authors = [
   "Quentin de Quelen <quentin@dequelen.me>",
   "Clément Renault <clement@meilisearch.com>",

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -57,7 +57,7 @@ girafo: { number_of_documents: 0, field_distribution: {} }
 [timestamp] [4,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.1"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, stop reason: "created batch containing only task with id 1 of type `indexCreation` that cannot be batched with any other task.", }
 2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 2 of type `indexCreation` that cannot be batched with any other task.", }
 3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 3 of type `indexCreation` that cannot be batched with any other task.", }

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 ----------------------------------------------------------------------
 ### Status:
 enqueued [0,]

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 ----------------------------------------------------------------------
 ### Status:

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 ----------------------------------------------------------------------
 ### Status:
@@ -37,7 +37,7 @@ catto [1,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.1"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 ----------------------------------------------------------------------
@@ -40,7 +40,7 @@ doggo [2,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.1"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]

View File

@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
 []
 ----------------------------------------------------------------------
 ### All Tasks:
-0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 28, 1) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 28, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
 3 {uid: 3, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -43,7 +43,7 @@ doggo [2,3,]
 [timestamp] [0,]
 ----------------------------------------------------------------------
 ### All Batches:
-0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.1"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.28.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
 ----------------------------------------------------------------------
 ### Batch to tasks mapping:
 0 [0,]

View File

@@ -183,7 +183,11 @@ pub async fn get_metrics(
     crate::metrics::MEILISEARCH_LAST_FINISHED_BATCHES_PROGRESS_TRACE_MS.reset();
     let (batches, _total) = index_scheduler.get_batches_from_authorized_indexes(
         // Fetch the finished batches...
-        &Query { statuses: Some(vec![Status::Succeeded, Status::Failed]), ..Query::default() },
+        &Query {
+            statuses: Some(vec![Status::Succeeded, Status::Failed]),
+            limit: Some(1),
+            ..Query::default()
+        },
         auth_filters,
     )?;
     // ...and get the last batch only.
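This hunk is the fix named in the PR title and in commit 4fad5e5d42: the `/metrics` route only needs the most recently finished batch to fill the last-batch progress-trace gauge, so the query now asks the scheduler for at most one succeeded or failed batch instead of the whole history. A minimal, self-contained sketch of the same idea; `BatchQuery`, `Batch`, and `BatchStore` below are hypothetical stand-ins for illustration, not the index-scheduler API:

// Hypothetical stand-ins for the index-scheduler types; only the idea is taken
// from the diff above: request at most one finished batch, newest first.
struct BatchQuery {
    statuses: Option<Vec<&'static str>>,
    limit: Option<usize>,
}

struct Batch {
    uid: u64,
}

struct BatchStore {
    finished: Vec<Batch>, // ordered from oldest to newest
}

impl BatchStore {
    fn get_batches(&self, query: &BatchQuery) -> Vec<&Batch> {
        let limit = query.limit.unwrap_or(usize::MAX);
        // With `limit: Some(1)` this stops after the most recent finished batch
        // instead of walking the whole batch history on every metrics scrape.
        self.finished.iter().rev().take(limit).collect()
    }
}

fn main() {
    let store = BatchStore { finished: vec![Batch { uid: 0 }, Batch { uid: 1 }] };
    let query = BatchQuery { statuses: Some(vec!["succeeded", "failed"]), limit: Some(1) };
    let batches = store.get_batches(&query);
    assert_eq!(batches.len(), 1);
    assert_eq!(batches[0].uid, 1); // only the last finished batch feeds the gauge
}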

View File

@@ -2,6 +2,7 @@ mod chat;
 mod distinct;
 mod errors;
 mod get_settings;
+mod parent_seachable_fields;
 mod prefix_search_settings;
 mod proximity_settings;
 mod tokenizer_customization;

View File

@@ -0,0 +1,114 @@
use meili_snap::{json_string, snapshot};
use once_cell::sync::Lazy;

use crate::common::Server;
use crate::json;

static DOCUMENTS: Lazy<crate::common::Value> = Lazy::new(|| {
    json!([
        {
            "id": 1,
            "meta": {
                "title": "Soup of the day",
                "description": "many the fish",
            }
        },
        {
            "id": 2,
            "meta": {
                "title": "Soup of day",
                "description": "many the lazy fish",
            }
        },
        {
            "id": 3,
            "meta": {
                "title": "the Soup of day",
                "description": "many the fish",
            }
        },
    ])
});

#[actix_rt::test]
async fn nested_field_becomes_searchable() {
    let server = Server::new_shared();
    let index = server.unique_index();

    let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await;
    server.wait_task(task.uid()).await.succeeded();

    let (response, code) = index
        .update_settings(json!({
            "searchableAttributes": ["meta.title"]
        }))
        .await;
    assert_eq!("202", code.as_str(), "{response:?}");
    server.wait_task(response.uid()).await.succeeded();

    // We expect no documents when searching for
    // a nested non-searchable field
    index
        .search(json!({"q": "many fish"}), |response, code| {
            snapshot!(code, @"200 OK");
            snapshot!(json_string!(response["hits"]), @r###"[]"###);
        })
        .await;

    let (response, code) = index
        .update_settings(json!({
            "searchableAttributes": ["meta.title", "meta.description"]
        }))
        .await;
    assert_eq!("202", code.as_str(), "{response:?}");
    server.wait_task(response.uid()).await.succeeded();

    // We expect all the documents when the nested field becomes searchable
    index
        .search(json!({"q": "many fish"}), |response, code| {
            snapshot!(code, @"200 OK");
            snapshot!(json_string!(response["hits"]), @r###"
            [
              {
                "id": 1,
                "meta": {
                  "title": "Soup of the day",
                  "description": "many the fish"
                }
              },
              {
                "id": 3,
                "meta": {
                  "title": "the Soup of day",
                  "description": "many the fish"
                }
              },
              {
                "id": 2,
                "meta": {
                  "title": "Soup of day",
                  "description": "many the lazy fish"
                }
              }
            ]
            "###);
        })
        .await;

    let (response, code) = index
        .update_settings(json!({
            "searchableAttributes": ["meta.title"]
        }))
        .await;
    assert_eq!("202", code.as_str(), "{response:?}");
    server.wait_task(response.uid()).await.succeeded();

    // We expect no documents when searching for
    // a nested non-searchable field
    index
        .search(json!({"q": "many fish"}), |response, code| {
            snapshot!(code, @"200 OK");
            snapshot!(json_string!(response["hits"]), @r###"[]"###);
        })
        .await;
}

View File

@@ -43,7 +43,7 @@ async fn version_too_old() {
     std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap();
     let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
     let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
-    snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.28.1");
+    snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.28.2");
 }
 #[actix_rt::test]
@@ -58,7 +58,7 @@ async fn version_requires_downgrade() {
     std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap();
     let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
     let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
-    snapshot!(err, @"Database version 1.28.2 is higher than the Meilisearch version 1.28.1. Downgrade is not supported");
+    snapshot!(err, @"Database version 1.28.3 is higher than the Meilisearch version 1.28.2. Downgrade is not supported");
 }
 #[actix_rt::test]

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "progress": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "stats": {
     "totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "progress": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "stats": {
     "totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "progress": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "stats": {
     "totalNbTasks": 1,

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "canceledBy": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "error": null,
   "duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "canceledBy": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "error": null,
   "duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "canceledBy": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "error": null,
   "duration": "[duration]",

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "progress": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "stats": {
     "totalNbTasks": 1,

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
   "canceledBy": null,
   "details": {
     "upgradeFrom": "v1.12.0",
-    "upgradeTo": "v1.28.1"
+    "upgradeTo": "v1.28.2"
   },
   "error": null,
   "duration": "[duration]",

View File

@@ -18,6 +18,8 @@ use crate::{
 pub struct Metadata {
     /// The weight as defined in the FieldidsWeightsMap of the searchable attribute if it is searchable.
     pub searchable: Option<Weight>,
+    /// The field is part of the exact attributes.
+    pub exact: bool,
     /// The field is part of the sortable attributes.
     pub sortable: bool,
     /// The field is defined as the distinct attribute.
@@ -209,6 +211,7 @@ impl Metadata {
 #[derive(Debug, Clone)]
 pub struct MetadataBuilder {
     searchable_attributes: Option<Vec<String>>,
+    exact_searchable_attributes: Vec<String>,
     filterable_attributes: Vec<FilterableAttributesRule>,
     sortable_attributes: HashSet<String>,
     localized_attributes: Option<Vec<LocalizedAttributesRule>>,
@@ -220,15 +223,18 @@ impl MetadataBuilder {
     pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result<Self> {
         let searchable_attributes = index
             .user_defined_searchable_fields(rtxn)?
-            .map(|fields| fields.into_iter().map(|s| s.to_string()).collect());
+            .map(|fields| fields.into_iter().map(String::from).collect());
+        let exact_searchable_attributes =
+            index.exact_attributes(rtxn)?.into_iter().map(String::from).collect();
         let filterable_attributes = index.filterable_attributes_rules(rtxn)?;
         let sortable_attributes = index.sortable_fields(rtxn)?;
         let localized_attributes = index.localized_attributes_rules(rtxn)?;
-        let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string());
+        let distinct_attribute = index.distinct_field(rtxn)?.map(String::from);
         let asc_desc_attributes = index.asc_desc_fields(rtxn)?;
         Ok(Self::new(
             searchable_attributes,
+            exact_searchable_attributes,
             filterable_attributes,
             sortable_attributes,
             localized_attributes,
@@ -242,6 +248,7 @@ impl MetadataBuilder {
     /// This is used for testing, prefer using `MetadataBuilder::from_index` instead.
     pub fn new(
         searchable_attributes: Option<Vec<String>>,
+        exact_searchable_attributes: Vec<String>,
         filterable_attributes: Vec<FilterableAttributesRule>,
         sortable_attributes: HashSet<String>,
         localized_attributes: Option<Vec<LocalizedAttributesRule>>,
@@ -256,6 +263,7 @@ impl MetadataBuilder {
         Self {
             searchable_attributes,
+            exact_searchable_attributes,
             filterable_attributes,
             sortable_attributes,
             localized_attributes,
@@ -269,6 +277,7 @@ impl MetadataBuilder {
             // Vectors fields are not searchable, filterable, distinct or asc_desc
             return Metadata {
                 searchable: None,
+                exact: false,
                 sortable: false,
                 distinct: false,
                 asc_desc: false,
@@ -296,6 +305,7 @@ impl MetadataBuilder {
             // Geo fields are not searchable, distinct or asc_desc
             return Metadata {
                 searchable: None,
+                exact: false,
                 sortable,
                 distinct: false,
                 asc_desc: false,
@@ -309,6 +319,7 @@ impl MetadataBuilder {
             debug_assert!(!sortable, "geojson fields should not be sortable");
             return Metadata {
                 searchable: None,
+                exact: false,
                 sortable,
                 distinct: false,
                 asc_desc: false,
@@ -329,6 +340,8 @@ impl MetadataBuilder {
             None => Some(0),
         };
+        let exact = self.exact_searchable_attributes.iter().any(|attr| is_faceted_by(field, attr));
         let distinct =
             self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field);
         let asc_desc = self.asc_desc_attributes.contains(field);
@@ -343,6 +356,7 @@ impl MetadataBuilder {
         Metadata {
             searchable,
+            exact,
             sortable,
             distinct,
             asc_desc,
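The new `exact` flag on `Metadata` is derived the same way as the other per-field flags: a field is exact when any configured exact attribute matches it, where `is_faceted_by` also accepts nested children of the attribute. A small self-contained sketch of that matching rule; `is_nested_under` is a hypothetical re-statement for illustration, not the milli helper itself:

/// Hypothetical re-statement of the matching rule assumed for `is_faceted_by`:
/// a field matches an attribute when it is the attribute itself or one of its
/// dot-separated nested children.
fn is_nested_under(field: &str, attr: &str) -> bool {
    field == attr || (field.starts_with(attr) && field[attr.len()..].starts_with('.'))
}

fn is_exact(field: &str, exact_attributes: &[&str]) -> bool {
    exact_attributes.iter().any(|attr| is_nested_under(field, attr))
}

fn main() {
    let exact_attributes = ["meta.title", "sku"];
    assert!(is_exact("meta.title", &exact_attributes));
    assert!(is_exact("meta.title.en", &exact_attributes)); // nested child matches too
    assert!(!is_exact("meta.description", &exact_attributes));
    assert!(!is_exact("skuffle", &exact_attributes)); // a raw prefix alone is not enough
}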

View File

@@ -8,17 +8,26 @@ use bumpalo::Bump;
 use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
+use crate::fields_ids_map::metadata::Metadata;
 use crate::update::new::document::DocumentContext;
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
+use crate::update::new::extract::searchable::has_searchable_children;
 use crate::update::new::indexer::document_changes::{
     extract, DocumentChanges, Extractor, IndexingContext,
 };
+use crate::update::new::indexer::settings_changes::{
+    settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::update::new::{DocumentChange, DocumentIdentifiers};
+use crate::update::settings::SettingsDelta;
+use crate::{
+    bucketed_position, DocumentId, FieldId, PatternMatch, Result, UserError,
+    MAX_POSITION_PER_ATTRIBUTE,
+};

 const MAX_COUNTED_WORDS: usize = 30;
@@ -34,6 +43,15 @@ pub struct WordDocidsBalancedCaches<'extractor> {
 unsafe impl MostlySend for WordDocidsBalancedCaches<'_> {}

+/// Whether to extract or skip fields during word extraction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum FieldDbExtraction {
+    /// Extract the word and put it in to the fid-based databases.
+    Extract,
+    /// Do not store the word in the fid-based databases.
+    Skip,
+}

 impl<'extractor> WordDocidsBalancedCaches<'extractor> {
     pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
         Self {
@@ -47,12 +65,14 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
         }
     }

+    #[allow(clippy::too_many_arguments)]
     fn insert_add_u32(
         &mut self,
         field_id: FieldId,
         position: u16,
         word: &str,
         exact: bool,
+        field_db_extraction: FieldDbExtraction,
         docid: u32,
         bump: &Bump,
     ) -> Result<()> {
@@ -66,11 +86,13 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
         let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
         let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
-        buffer.clear();
-        buffer.extend_from_slice(word_bytes);
-        buffer.push(0);
-        buffer.extend_from_slice(&field_id.to_be_bytes());
-        self.word_fid_docids.insert_add_u32(&buffer, docid)?;
+        if field_db_extraction == FieldDbExtraction::Extract {
+            buffer.clear();
+            buffer.extend_from_slice(word_bytes);
+            buffer.push(0);
+            buffer.extend_from_slice(&field_id.to_be_bytes());
+            self.word_fid_docids.insert_add_u32(&buffer, docid)?;
+        }

         let position = bucketed_position(position);
         buffer.clear();
@@ -83,21 +105,26 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
             self.flush_fid_word_count(&mut buffer)?;
         }

-        self.fid_word_count
-            .entry(field_id)
-            .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
-            .or_insert((None, Some(1)));
+        if field_db_extraction == FieldDbExtraction::Extract {
+            self.fid_word_count
+                .entry(field_id)
+                .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
+                .or_insert((None, Some(1)));
+        }

         self.current_docid = Some(docid);

         Ok(())
     }

+    #[allow(clippy::too_many_arguments)]
     fn insert_del_u32(
         &mut self,
         field_id: FieldId,
         position: u16,
         word: &str,
         exact: bool,
+        field_db_extraction: FieldDbExtraction,
         docid: u32,
         bump: &Bump,
     ) -> Result<()> {
@@ -111,11 +138,13 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
         let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
         let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
-        buffer.clear();
-        buffer.extend_from_slice(word_bytes);
-        buffer.push(0);
-        buffer.extend_from_slice(&field_id.to_be_bytes());
-        self.word_fid_docids.insert_del_u32(&buffer, docid)?;
+        if field_db_extraction == FieldDbExtraction::Extract {
+            buffer.clear();
+            buffer.extend_from_slice(word_bytes);
+            buffer.push(0);
+            buffer.extend_from_slice(&field_id.to_be_bytes());
+            self.word_fid_docids.insert_del_u32(&buffer, docid)?;
+        }

         let position = bucketed_position(position);
         buffer.clear();
@@ -128,10 +157,12 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
             self.flush_fid_word_count(&mut buffer)?;
         }

-        self.fid_word_count
-            .entry(field_id)
-            .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
-            .or_insert((Some(1), None));
+        if field_db_extraction == FieldDbExtraction::Extract {
+            self.fid_word_count
+                .entry(field_id)
+                .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
+                .or_insert((Some(1), None));
+        }

         self.current_docid = Some(docid);
@@ -325,6 +356,24 @@ impl WordDocidsExtractors {
             exact_attributes.iter().any(|attr| contained_in(fname, attr))
                 || disabled_typos_terms.is_exact(word)
         };

+        let mut should_tokenize = |field_name: &str| {
+            let Some((field_id, meta)) = new_fields_ids_map.id_with_metadata_or_insert(field_name)
+            else {
+                return Err(UserError::AttributeLimitReached.into());
+            };
+            let pattern_match = if meta.is_searchable() {
+                PatternMatch::Match
+            } else {
+                // TODO: should be a match on the field_name using `match_field_legacy` function,
+                // but for legacy reasons we iterate over all the fields to fill the field_id_map.
+                PatternMatch::Parent
+            };
+            Ok((field_id, pattern_match))
+        };

         match document_change {
             DocumentChange::Deletion(inner) => {
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
@@ -333,13 +382,14 @@ impl WordDocidsExtractors {
                         pos,
                         word,
                         is_exact(fname, word),
+                        FieldDbExtraction::Extract,
                         inner.docid(),
                         doc_alloc,
                     )
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index, context.db_fields_ids_map)?,
-                    new_fields_ids_map,
+                    &mut should_tokenize,
                     &mut token_fn,
                 )?;
             }
@@ -361,13 +411,14 @@ impl WordDocidsExtractors {
                         pos,
                         word,
                         is_exact(fname, word),
+                        FieldDbExtraction::Extract,
                         inner.docid(),
                         doc_alloc,
                     )
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index, context.db_fields_ids_map)?,
-                    new_fields_ids_map,
+                    &mut should_tokenize,
                     &mut token_fn,
                 )?;
@@ -377,13 +428,14 @@ impl WordDocidsExtractors {
                         pos,
                         word,
                         is_exact(fname, word),
+                        FieldDbExtraction::Extract,
                         inner.docid(),
                         doc_alloc,
                     )
                 };
                 document_tokenizer.tokenize_document(
                     inner.merged(rtxn, index, context.db_fields_ids_map)?,
-                    new_fields_ids_map,
+                    &mut should_tokenize,
                     &mut token_fn,
                 )?;
             }
@@ -394,13 +446,14 @@ impl WordDocidsExtractors {
                         pos,
                         word,
                         is_exact(fname, word),
+                        FieldDbExtraction::Extract,
                         inner.docid(),
                         doc_alloc,
                     )
                 };
                 document_tokenizer.tokenize_document(
                     inner.inserted(),
-                    new_fields_ids_map,
+                    &mut should_tokenize,
                     &mut token_fn,
                 )?;
             }
@@ -411,3 +464,292 @@ impl WordDocidsExtractors {
         cached_sorter.flush_fid_word_count(&mut buffer)
     }
 }
pub struct WordDocidsSettingsExtractorsData<'a, SD> {
tokenizer: DocumentTokenizer<'a>,
max_memory_by_thread: Option<usize>,
buckets: usize,
settings_delta: &'a SD,
}
impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
for WordDocidsSettingsExtractorsData<'_, SD>
{
type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
self.buckets,
self.max_memory_by_thread,
extractor_alloc,
))))
}
fn process<'doc>(
&'doc self,
documents: impl Iterator<Item = crate::Result<DocumentIdentifiers<'doc>>>,
context: &'doc DocumentContext<Self::Data>,
) -> crate::Result<()> {
for document in documents {
let document = document?;
SettingsChangeWordDocidsExtractors::extract_document_from_settings_change(
document,
context,
&self.tokenizer,
self.settings_delta,
)?;
}
Ok(())
}
}
pub struct SettingsChangeWordDocidsExtractors;
impl SettingsChangeWordDocidsExtractors {
pub fn run_extraction<'fid, 'indexer, 'index, 'extractor, SD, MSP>(
settings_delta: &SD,
documents: &'indexer DocumentsIndentifiers<'indexer>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<WordDocidsCaches<'extractor>>
where
SD: SettingsDelta + Sync,
MSP: Fn() -> bool + Sync,
{
// Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
// TODO we need to read the new AND old settings to support changing global parameters
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
let allowed_separators: Option<Vec<_>> =
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.build();
let localized_attributes_rules =
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tokenizer,
localized_attributes_rules: &localized_attributes_rules,
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
};
let extractor_data = WordDocidsSettingsExtractorsData {
tokenizer: document_tokenizer,
max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
buckets: rayon::current_num_threads(),
settings_delta,
};
let datastore = ThreadLocal::new();
{
let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
let _entered = span.enter();
settings_change_extract(
documents,
&extractor_data,
indexing_context,
extractor_allocs,
&datastore,
step,
)?;
}
let mut merger = WordDocidsCaches::new();
for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
merger.push(cache)?;
}
Ok(merger)
}
/// Extracts document words from a settings change.
fn extract_document_from_settings_change<SD: SettingsDelta>(
document: DocumentIdentifiers<'_>,
context: &DocumentContext<RefCell<Option<WordDocidsBalancedCaches>>>,
document_tokenizer: &DocumentTokenizer,
settings_delta: &SD,
) -> Result<()> {
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
let doc_alloc = &context.doc_alloc;
let new_fields_ids_map = settings_delta.new_fields_ids_map();
let old_fields_ids_map = context.index.fields_ids_map_with_metadata(&context.rtxn)?;
let old_searchable = settings_delta.old_searchable_attributes().as_ref();
let new_searchable = settings_delta.new_searchable_attributes().as_ref();
let current_document = document.current(
&context.rtxn,
context.index,
old_fields_ids_map.as_fields_ids_map(),
)?;
#[derive(Debug, Clone, Copy, PartialEq)]
enum ActionToOperate {
ReindexAllFields,
// TODO improve by listing field prefixes
IndexAddedFields,
SkipDocument,
}
let mut action = ActionToOperate::SkipDocument;
// Here we do a preliminary check to determine the action to take.
// This check doesn't trigger the tokenizer as we never return
// PatternMatch::Match.
document_tokenizer.tokenize_document(
current_document,
&mut |field_name| {
let fid = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
// If the document must be reindexed, early return NoMatch to stop the scanning process.
if action == ActionToOperate::ReindexAllFields {
return Ok((fid, PatternMatch::NoMatch));
}
let old_field_metadata = old_fields_ids_map.metadata(fid).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(fid).unwrap();
action = match (old_field_metadata, new_field_metadata) {
// At least one field is added or removed from the exact fields => ReindexAllFields
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. })
if old_exact != new_exact =>
{
ActionToOperate::ReindexAllFields
}
// At least one field is removed from the searchable fields => ReindexAllFields
(Metadata { searchable: Some(_), .. }, Metadata { searchable: None, .. }) => {
ActionToOperate::ReindexAllFields
}
// At least one field is added in the searchable fields => IndexAddedFields
(Metadata { searchable: None, .. }, Metadata { searchable: Some(_), .. }) => {
// We can safely overwrite the action, because we early return when action is ReindexAllFields.
ActionToOperate::IndexAddedFields
}
_ => action,
};
Ok((fid, PatternMatch::Parent))
},
&mut |_, _, _, _| Ok(()),
)?;
// Early return when we don't need to index the document
if action == ActionToOperate::SkipDocument {
return Ok(());
}
let mut should_tokenize = |field_name: &str| {
let field_id = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
let pattern_match = match action {
ActionToOperate::ReindexAllFields => {
if old_field_metadata.is_searchable() || new_field_metadata.is_searchable() {
PatternMatch::Match
// If any old or new field is searchable then we need to iterate over all fields
// else if any field matches we need to iterate over all fields
} else if has_searchable_children(
field_name,
old_searchable.zip(new_searchable).map(|(old, new)| old.iter().chain(new)),
) {
PatternMatch::Parent
} else {
PatternMatch::NoMatch
}
}
ActionToOperate::IndexAddedFields => {
// Was not searchable but now is
if !old_field_metadata.is_searchable() && new_field_metadata.is_searchable() {
PatternMatch::Match
// If the field is now a parent of a searchable field
} else if has_searchable_children(field_name, new_searchable) {
PatternMatch::Parent
} else {
PatternMatch::NoMatch
}
}
ActionToOperate::SkipDocument => unreachable!(),
};
Ok((field_id, pattern_match))
};
let old_disabled_typos_terms = settings_delta.old_disabled_typos_terms();
let new_disabled_typos_terms = settings_delta.new_disabled_typos_terms();
let mut token_fn = |_field_name: &str, field_id, pos, word: &str| {
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
match (old_field_metadata, new_field_metadata) {
(
Metadata { searchable: Some(_), exact: old_exact, .. },
Metadata { searchable: None, .. },
) => cached_sorter.insert_del_u32(
field_id,
pos,
word,
old_exact || old_disabled_typos_terms.is_exact(word),
// We deleted the field globally
FieldDbExtraction::Skip,
document.docid(),
doc_alloc,
),
(
Metadata { searchable: None, .. },
Metadata { searchable: Some(_), exact: new_exact, .. },
) => cached_sorter.insert_add_u32(
field_id,
pos,
word,
new_exact || new_disabled_typos_terms.is_exact(word),
FieldDbExtraction::Extract,
document.docid(),
doc_alloc,
),
(Metadata { searchable: None, .. }, Metadata { searchable: None, .. }) => {
unreachable!()
}
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. }) => {
cached_sorter.insert_del_u32(
field_id,
pos,
word,
old_exact || old_disabled_typos_terms.is_exact(word),
// The field has already been extracted
FieldDbExtraction::Skip,
document.docid(),
doc_alloc,
)?;
cached_sorter.insert_add_u32(
field_id,
pos,
word,
new_exact || new_disabled_typos_terms.is_exact(word),
// The field has already been extracted
FieldDbExtraction::Skip,
document.docid(),
doc_alloc,
)
}
}
};
// TODO we must tokenize twice when we change global parameters like stop words,
// the language settings, dictionary, separators, non-separators...
document_tokenizer.tokenize_document(
current_document,
&mut should_tokenize,
&mut token_fn,
)?;
Ok(())
}
}
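The settings-delta extractor added above boils down to a per-document decision made during the preliminary scan: compare each field's old and new metadata, skip the document when nothing relevant changed, index only the newly searchable fields when fields were added, and re-extract everything when a searchable field was removed or the exact set changed. A condensed, self-contained restatement of that decision table, with a simplified `FieldMeta` standing in for the real `Metadata` type:

/// Simplified stand-in for the fields-ids-map metadata used by the extractor.
#[derive(Clone, Copy)]
struct FieldMeta {
    searchable: bool,
    exact: bool,
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum Action {
    SkipDocument,
    IndexAddedFields,
    ReindexAllFields,
}

/// Mirrors the preliminary scan: fold over (old, new) metadata pairs and stop
/// escalating as soon as a full reindex is required.
fn decide(fields: &[(FieldMeta, FieldMeta)]) -> Action {
    let mut action = Action::SkipDocument;
    for (old, new) in fields {
        if old.exact != new.exact || (old.searchable && !new.searchable) {
            // Exact set changed or a searchable field was removed: redo everything.
            return Action::ReindexAllFields;
        }
        if !old.searchable && new.searchable {
            // A field became searchable: only the added fields need extraction.
            action = Action::IndexAddedFields;
        }
    }
    action
}

fn main() {
    let unchanged = (FieldMeta { searchable: true, exact: false }, FieldMeta { searchable: true, exact: false });
    let added = (FieldMeta { searchable: false, exact: false }, FieldMeta { searchable: true, exact: false });
    let removed = (FieldMeta { searchable: true, exact: false }, FieldMeta { searchable: false, exact: false });

    assert_eq!(decide(&[unchanged]), Action::SkipDocument);
    assert_eq!(decide(&[unchanged, added]), Action::IndexAddedFields);
    assert_eq!(decide(&[added, removed]), Action::ReindexAllFields);
}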

View File

@@ -6,17 +6,24 @@ use bumpalo::Bump;
 use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
+use crate::fields_ids_map::metadata::Metadata;
+use crate::proximity::ProximityPrecision::*;
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::{Document, DocumentContext};
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::indexer::document_changes::{
     extract, DocumentChanges, Extractor, IndexingContext,
 };
+use crate::update::new::indexer::settings_change_extract;
+use crate::update::new::indexer::settings_changes::{
+    DocumentsIndentifiers, SettingsChangeExtractor,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::update::new::{DocumentChange, DocumentIdentifiers};
+use crate::update::settings::SettingsDelta;
+use crate::{FieldId, PatternMatch, Result, UserError, MAX_POSITION_PER_ATTRIBUTE};

 pub struct WordPairProximityDocidsExtractorData<'a> {
     tokenizer: DocumentTokenizer<'a>,
@@ -116,7 +123,7 @@ impl WordPairProximityDocidsExtractor {
     // and to store the docids of the documents that have a number of words in a given field
     // equal to or under than MAX_COUNTED_WORDS.
     fn extract_document_change(
-        context: &DocumentContext<RefCell<BalancedCaches>>,
+        context: &DocumentContext<RefCell<BalancedCaches<'_>>>,
         document_tokenizer: &DocumentTokenizer,
         searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
@@ -147,8 +154,12 @@ impl WordPairProximityDocidsExtractor {
             process_document_tokens(
                 document,
                 document_tokenizer,
-                new_fields_ids_map,
                 &mut word_positions,
+                &mut |field_name| {
+                    new_fields_ids_map
+                        .id_with_metadata_or_insert(field_name)
+                        .ok_or(UserError::AttributeLimitReached.into())
+                },
                 &mut |(w1, w2), prox| {
                     del_word_pair_proximity.push(((w1, w2), prox));
                 },
@@ -170,8 +181,12 @@ impl WordPairProximityDocidsExtractor {
process_document_tokens( process_document_tokens(
document, document,
document_tokenizer, document_tokenizer,
new_fields_ids_map,
&mut word_positions, &mut word_positions,
&mut |field_name| {
new_fields_ids_map
.id_with_metadata_or_insert(field_name)
.ok_or(UserError::AttributeLimitReached.into())
},
&mut |(w1, w2), prox| { &mut |(w1, w2), prox| {
del_word_pair_proximity.push(((w1, w2), prox)); del_word_pair_proximity.push(((w1, w2), prox));
}, },
@@ -180,8 +195,12 @@ impl WordPairProximityDocidsExtractor {
process_document_tokens( process_document_tokens(
document, document,
document_tokenizer, document_tokenizer,
new_fields_ids_map,
&mut word_positions, &mut word_positions,
&mut |field_name| {
new_fields_ids_map
.id_with_metadata_or_insert(field_name)
.ok_or(UserError::AttributeLimitReached.into())
},
&mut |(w1, w2), prox| { &mut |(w1, w2), prox| {
add_word_pair_proximity.push(((w1, w2), prox)); add_word_pair_proximity.push(((w1, w2), prox));
}, },
@@ -192,8 +211,12 @@ impl WordPairProximityDocidsExtractor {
process_document_tokens( process_document_tokens(
document, document,
document_tokenizer, document_tokenizer,
new_fields_ids_map,
&mut word_positions, &mut word_positions,
&mut |field_name| {
new_fields_ids_map
.id_with_metadata_or_insert(field_name)
.ok_or(UserError::AttributeLimitReached.into())
},
&mut |(w1, w2), prox| { &mut |(w1, w2), prox| {
add_word_pair_proximity.push(((w1, w2), prox)); add_word_pair_proximity.push(((w1, w2), prox));
}, },
@@ -257,8 +280,8 @@ fn drain_word_positions(
fn process_document_tokens<'doc>( fn process_document_tokens<'doc>(
document: impl Document<'doc>, document: impl Document<'doc>,
document_tokenizer: &DocumentTokenizer, document_tokenizer: &DocumentTokenizer,
fields_ids_map: &mut GlobalFieldsIdsMap,
word_positions: &mut VecDeque<(Rc<str>, u16)>, word_positions: &mut VecDeque<(Rc<str>, u16)>,
field_id_and_metadata: &mut impl FnMut(&str) -> Result<(FieldId, Metadata)>,
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8), word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
) -> Result<()> { ) -> Result<()> {
let mut field_id = None; let mut field_id = None;
@@ -279,8 +302,248 @@ fn process_document_tokens<'doc>(
word_positions.push_back((Rc::from(word), pos)); word_positions.push_back((Rc::from(word), pos));
Ok(()) Ok(())
}; };
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
let mut should_tokenize = |field_name: &str| {
let (field_id, meta) = field_id_and_metadata(field_name)?;
let pattern_match = if meta.is_searchable() {
PatternMatch::Match
} else {
// TODO: this should be a match on the field_name using the `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
PatternMatch::Parent
};
Ok((field_id, pattern_match))
};
document_tokenizer.tokenize_document(document, &mut should_tokenize, &mut token_fn)?;
drain_word_positions(word_positions, word_pair_proximity); drain_word_positions(word_positions, word_pair_proximity);
Ok(()) Ok(())
} }
pub struct WordPairProximityDocidsSettingsExtractorsData<'a, SD> {
tokenizer: DocumentTokenizer<'a>,
max_memory_by_thread: Option<usize>,
buckets: usize,
settings_delta: &'a SD,
}
impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
for WordPairProximityDocidsSettingsExtractorsData<'_, SD>
{
type Data = RefCell<BalancedCaches<'extractor>>;
fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {
Ok(RefCell::new(BalancedCaches::new_in(
self.buckets,
self.max_memory_by_thread,
extractor_alloc,
)))
}
fn process<'doc>(
&'doc self,
documents: impl Iterator<Item = crate::Result<DocumentIdentifiers<'doc>>>,
context: &'doc DocumentContext<Self::Data>,
) -> crate::Result<()> {
for document in documents {
let document = document?;
SettingsChangeWordPairProximityDocidsExtractors::extract_document_from_settings_change(
document,
context,
&self.tokenizer,
self.settings_delta,
)?;
}
Ok(())
}
}
pub struct SettingsChangeWordPairProximityDocidsExtractors;
impl SettingsChangeWordPairProximityDocidsExtractors {
pub fn run_extraction<'fid, 'indexer, 'index, 'extractor, SD, MSP>(
settings_delta: &SD,
documents: &'indexer DocumentsIndentifiers<'indexer>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
SD: SettingsDelta + Sync,
MSP: Fn() -> bool + Sync,
{
// Warning: this is duplicated code from extract_word_docids.rs
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
let allowed_separators: Option<Vec<_>> =
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.build();
let localized_attributes_rules =
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tokenizer,
localized_attributes_rules: &localized_attributes_rules,
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
};
let extractor_data = WordPairProximityDocidsSettingsExtractorsData {
tokenizer: document_tokenizer,
max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
buckets: rayon::current_num_threads(),
settings_delta,
};
let datastore = ThreadLocal::new();
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids_extraction");
let _entered = span.enter();
settings_change_extract(
documents,
&extractor_data,
indexing_context,
extractor_allocs,
&datastore,
step,
)?;
}
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
}
/// Extracts document words from a settings change.
fn extract_document_from_settings_change<SD: SettingsDelta>(
document: DocumentIdentifiers<'_>,
context: &DocumentContext<RefCell<BalancedCaches<'_>>>,
document_tokenizer: &DocumentTokenizer,
settings_delta: &SD,
) -> Result<()> {
let mut cached_sorter = context.data.borrow_mut_or_yield();
let doc_alloc = &context.doc_alloc;
let new_fields_ids_map = settings_delta.new_fields_ids_map();
let old_fields_ids_map = settings_delta.old_fields_ids_map();
let old_proximity_precision = *settings_delta.old_proximity_precision();
let new_proximity_precision = *settings_delta.new_proximity_precision();
let current_document = document.current(
&context.rtxn,
context.index,
old_fields_ids_map.as_fields_ids_map(),
)?;
#[derive(Debug, Clone, Copy, PartialEq)]
enum ActionToOperate {
ReindexAllFields,
SkipDocument,
}
// TODO prefix_fid delete_old_fid_based_databases
let mut action = match (old_proximity_precision, new_proximity_precision) {
(ByAttribute, ByWord) => ActionToOperate::ReindexAllFields,
(_, _) => ActionToOperate::SkipDocument,
};
// Here we do a preliminary check to determine the action to take.
// This check doesn't trigger the tokenizer as we never return
// PatternMatch::Match.
if action != ActionToOperate::ReindexAllFields {
document_tokenizer.tokenize_document(
current_document,
&mut |field_name| {
let fid = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
// If the document must be reindexed, return NoMatch early to stop the scanning process.
if action == ActionToOperate::ReindexAllFields {
return Ok((fid, PatternMatch::NoMatch));
}
let old_field_metadata = old_fields_ids_map.metadata(fid).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(fid).unwrap();
action = match (old_field_metadata, new_field_metadata) {
// At least one field is removed or added from the searchable fields
(
Metadata { searchable: Some(_), .. },
Metadata { searchable: None, .. },
)
| (
Metadata { searchable: None, .. },
Metadata { searchable: Some(_), .. },
) => ActionToOperate::ReindexAllFields,
_ => action,
};
Ok((fid, PatternMatch::Parent))
},
&mut |_, _, _, _| Ok(()),
)?;
}
// Early return when we don't need to index the document
if action == ActionToOperate::SkipDocument {
return Ok(());
}
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
// This is a VecDeque and will stay small, so it can remain on the heap for now.
let mut word_positions: VecDeque<(Rc<str>, u16)> =
VecDeque::with_capacity(MAX_DISTANCE as usize);
process_document_tokens(
current_document,
// TODO Tokenize must be based on old settings
document_tokenizer,
&mut word_positions,
&mut |field_name| {
Ok(old_fields_ids_map.id_with_metadata(field_name).expect("All fields must exist"))
},
&mut |(w1, w2), prox| {
del_word_pair_proximity.push(((w1, w2), prox));
},
)?;
process_document_tokens(
current_document,
// TODO Tokenize must be based on new settings
document_tokenizer,
&mut word_positions,
&mut |field_name| {
Ok(new_fields_ids_map.id_with_metadata(field_name).expect("All fields must exist"))
},
&mut |(w1, w2), prox| {
add_word_pair_proximity.push(((w1, w2), prox));
},
)?;
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
del_word_pair_proximity.sort_unstable();
del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
for ((w1, w2), prox) in del_word_pair_proximity.iter() {
let key = build_key(*prox, w1, w2, &mut key_buffer);
cached_sorter.insert_del_u32(key, document.docid())?;
}
add_word_pair_proximity.sort_unstable();
add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
for ((w1, w2), prox) in add_word_pair_proximity.iter() {
let key = build_key(*prox, w1, w2, &mut key_buffer);
cached_sorter.insert_add_u32(key, document.docid())?;
}
Ok(())
}
}
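To summarize the flow implemented above: switching the proximity precision from ByAttribute to ByWord forces a full re-extraction of word pairs; otherwise the preliminary scan looks for fields whose searchable status changed, and the document is skipped when nothing changed. A minimal self-contained sketch of that decision, with simplified types standing in for the real Metadata and extractor:

#[derive(Clone, Copy, PartialEq)]
enum ProximityPrecision { ByWord, ByAttribute }

#[derive(Clone, Copy, PartialEq, Debug)]
enum Action { ReindexAllFields, SkipDocument }

// Simplified stand-in for the searchable part of the old/new field metadata.
struct FieldDelta { old_searchable: bool, new_searchable: bool }

fn decide(old: ProximityPrecision, new: ProximityPrecision, fields: &[FieldDelta]) -> Action {
    // Going from ByAttribute to ByWord requires recomputing every word pair.
    if old == ProximityPrecision::ByAttribute && new == ProximityPrecision::ByWord {
        return Action::ReindexAllFields;
    }
    // Otherwise, any field that gained or lost the searchable flag
    // also forces a full re-extraction of the document.
    if fields.iter().any(|f| f.old_searchable != f.new_searchable) {
        return Action::ReindexAllFields;
    }
    Action::SkipDocument
}

fn main() {
    let changed = [FieldDelta { old_searchable: true, new_searchable: false }];
    assert_eq!(
        decide(ProximityPrecision::ByWord, ProximityPrecision::ByWord, &changed),
        Action::ReindexAllFields
    );
    let unchanged = [FieldDelta { old_searchable: true, new_searchable: true }];
    assert_eq!(
        decide(ProximityPrecision::ByWord, ProximityPrecision::ByWord, &unchanged),
        Action::SkipDocument
    );
}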

View File

@@ -2,8 +2,12 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids; mod extract_word_pair_proximity_docids;
mod tokenize_document; mod tokenize_document;
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors}; pub use extract_word_docids::{
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; SettingsChangeWordDocidsExtractors, WordDocidsCaches, WordDocidsExtractors,
};
pub use extract_word_pair_proximity_docids::{
SettingsChangeWordPairProximityDocidsExtractors, WordPairProximityDocidsExtractor,
};
use crate::attribute_patterns::{match_field_legacy, PatternMatch}; use crate::attribute_patterns::{match_field_legacy, PatternMatch};
@@ -27,3 +31,17 @@ pub fn match_searchable_field(
selection selection
} }
/// Returns `true` if the provided `field_name` is a parent of at least one of the fields contained in `searchable`,
/// or if `searchable` is `None`.
fn has_searchable_children<I, A>(field_name: &str, searchable: Option<I>) -> bool
where
I: IntoIterator<Item = A>,
A: AsRef<str>,
{
searchable.is_none_or(|fields| {
fields
.into_iter()
.any(|attr| match_field_legacy(attr.as_ref(), field_name) == PatternMatch::Parent)
})
}
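A simplified, self-contained illustration of the contract documented above; the naive dotted-prefix check only stands in for `match_field_legacy`'s Parent detection and is not the real pattern matching:

fn has_searchable_children_sketch<I, A>(field_name: &str, searchable: Option<I>) -> bool
where
    I: IntoIterator<Item = A>,
    A: AsRef<str>,
{
    match searchable {
        // No explicit searchable list means every field (and child) is searchable.
        None => true,
        Some(fields) => fields
            .into_iter()
            // Naive stand-in for a Parent match: the attribute is nested under field_name.
            .any(|attr| attr.as_ref().starts_with(&format!("{field_name}."))),
    }
}

fn main() {
    // "user" has a searchable child "user.name".
    assert!(has_searchable_children_sketch("user", Some(["user.name"])));
    // "tags" has no searchable children in this list.
    assert!(!has_searchable_children_sketch("tags", Some(["user.name"])));
    // No searchable list at all: everything is considered searchable.
    assert!(has_searchable_children_sketch::<[&str; 0], &str>("anything", None));
}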

View File

@@ -8,10 +8,7 @@ use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{ use crate::update::new::extract::perm_json_p::{
seek_leaf_values_in_array, seek_leaf_values_in_object, Depth, seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
}; };
use crate::{ use crate::{FieldId, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH};
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
MAX_WORD_LENGTH,
};
// todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten // todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten
const MAX_DISTANCE: u32 = 8; const MAX_DISTANCE: u32 = 8;
@@ -26,26 +23,25 @@ impl DocumentTokenizer<'_> {
pub fn tokenize_document<'doc>( pub fn tokenize_document<'doc>(
&self, &self,
document: impl Document<'doc>, document: impl Document<'doc>,
field_id_map: &mut GlobalFieldsIdsMap, should_tokenize: &mut impl FnMut(&str) -> Result<(FieldId, PatternMatch)>,
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>, token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
let mut field_position = HashMap::new(); let mut field_position = HashMap::new();
let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
if meta.is_searchable() {
self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
}
// todo: should be a match on the field_name using `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
Ok(PatternMatch::Match)
};
for entry in document.iter_top_level_fields() { for entry in document.iter_top_level_fields() {
let (field_name, value) = entry?; let (field_name, value) = entry?;
if let (_, PatternMatch::NoMatch) = should_tokenize(field_name)? {
continue;
}
let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
let (fid, pattern_match) = should_tokenize(field_name)?;
if pattern_match == PatternMatch::Match {
self.tokenize_field(fid, field_name, value, token_fn, &mut field_position)?;
}
Ok(pattern_match)
};
// parse json. // parse json.
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object( Value::Object(object) => seek_leaf_values_in_object(
@@ -192,7 +188,7 @@ mod test {
use super::*; use super::*;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::update::new::document::{DocumentFromVersions, Versions};
use crate::FieldsIdsMap; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, UserError};
#[test] #[test]
fn test_tokenize_document() { fn test_tokenize_document() {
@@ -231,6 +227,7 @@ mod test {
Default::default(), Default::default(),
Default::default(), Default::default(),
Default::default(), Default::default(),
Default::default(),
None, None,
None, None,
Default::default(), Default::default(),
@@ -251,15 +248,19 @@ mod test {
let document = Versions::single(document); let document = Versions::single(document);
let document = DocumentFromVersions::new(&document); let document = DocumentFromVersions::new(&document);
let mut should_tokenize = |field_name: &str| {
let Some(field_id) = global_fields_ids_map.id_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
Ok((field_id, PatternMatch::Match))
};
document_tokenizer document_tokenizer
.tokenize_document( .tokenize_document(document, &mut should_tokenize, &mut |_fname, fid, pos, word| {
document, words.insert([fid, pos], word.to_string());
&mut global_fields_ids_map, Ok(())
&mut |_fname, fid, pos, word| { })
words.insert([fid, pos], word.to_string());
Ok(())
},
)
.unwrap(); .unwrap();
snapshot!(format!("{:#?}", words), @r###" snapshot!(format!("{:#?}", words), @r###"

View File

@@ -1,5 +1,6 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::fmt::Debug; use std::fmt::Debug;
use std::sync::RwLock;
use bumpalo::collections::Vec as BVec; use bumpalo::collections::Vec as BVec;
use bumpalo::Bump; use bumpalo::Bump;
@@ -27,7 +28,10 @@ use crate::vector::extractor::{
use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed};
use crate::vector::settings::ReindexAction; use crate::vector::settings::ReindexAction;
use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment};
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; use crate::{
DocumentId, FieldDistribution, GlobalFieldsIdsMap, InternalError, Result, ThreadPoolNoAbort,
UserError,
};
pub struct EmbeddingExtractor<'a, 'b> { pub struct EmbeddingExtractor<'a, 'b> {
embedders: &'a RuntimeEmbedders, embedders: &'a RuntimeEmbedders,
@@ -321,6 +325,15 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
let old_embedders = self.settings_delta.old_embedders(); let old_embedders = self.settings_delta.old_embedders();
let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
// We get a reference to the new and old fields ids maps but
// note that those are local versions where updates to them
// will not be reflected in the database. It's not an issue
// because new settings do not generate new fields.
let new_fields_ids_map = RwLock::new(self.settings_delta.new_fields_ids_map().clone());
let new_fields_ids_map = RefCell::new(GlobalFieldsIdsMap::new(&new_fields_ids_map));
let old_fields_ids_map = RwLock::new(self.settings_delta.old_fields_ids_map().clone());
let old_fields_ids_map = RefCell::new(GlobalFieldsIdsMap::new(&old_fields_ids_map));
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
let embedder_configs = context.index.embedding_configs(); let embedder_configs = context.index.embedding_configs();
for (embedder_name, action) in self.settings_delta.embedder_actions().iter() { for (embedder_name, action) in self.settings_delta.embedder_actions().iter() {
@@ -396,6 +409,7 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
if !must_regenerate { if !must_regenerate {
continue; continue;
} }
// we need to regenerate the prompts for the document // we need to regenerate the prompts for the document
chunks.settings_change_autogenerated( chunks.settings_change_autogenerated(
document.docid(), document.docid(),
@@ -406,7 +420,8 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
context.db_fields_ids_map, context.db_fields_ids_map,
)?, )?,
self.settings_delta, self.settings_delta,
context.new_fields_ids_map, &old_fields_ids_map,
&new_fields_ids_map,
&unused_vectors_distribution, &unused_vectors_distribution,
old_is_user_provided, old_is_user_provided,
fragments_changed, fragments_changed,
@@ -442,7 +457,8 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
context.db_fields_ids_map, context.db_fields_ids_map,
)?, )?,
self.settings_delta, self.settings_delta,
context.new_fields_ids_map, &old_fields_ids_map,
&new_fields_ids_map,
&unused_vectors_distribution, &unused_vectors_distribution,
old_is_user_provided, old_is_user_provided,
true, true,
@@ -638,7 +654,8 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
external_docid: &'a str, external_docid: &'a str,
document: D, document: D,
settings_delta: &SD, settings_delta: &SD,
fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>, old_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'a>>,
new_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'a>>,
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
old_is_user_provided: bool, old_is_user_provided: bool,
full_reindex: bool, full_reindex: bool,
@@ -733,10 +750,17 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template) old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template)
}; };
let extractor = let extractor = DocumentTemplateExtractor::new(
DocumentTemplateExtractor::new(document_template, doc_alloc, fields_ids_map); document_template,
doc_alloc,
new_fields_ids_map,
);
let old_extractor = old_document_template.map(|old_document_template| { let old_extractor = old_document_template.map(|old_document_template| {
DocumentTemplateExtractor::new(old_document_template, doc_alloc, fields_ids_map) DocumentTemplateExtractor::new(
old_document_template,
doc_alloc,
old_fields_ids_map,
)
}); });
let metadata = let metadata =
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };

View File

@@ -372,11 +372,10 @@ where
SD: SettingsDelta + Sync, SD: SettingsDelta + Sync,
{ {
// Create the list of document ids to extract // Create the list of document ids to extract
let rtxn = indexing_context.index.read_txn()?; let index = indexing_context.index;
let all_document_ids = let rtxn = index.read_txn()?;
indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::<Vec<_>>(); let all_document_ids = index.documents_ids(&rtxn)?.into_iter().collect::<Vec<_>>();
let primary_key = let primary_key = primary_key_from_db(index, &rtxn, &indexing_context.db_fields_ids_map)?;
primary_key_from_db(indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?;
let documents = DocumentsIndentifiers::new(&all_document_ids, primary_key); let documents = DocumentsIndentifiers::new(&all_document_ids, primary_key);
let span = let span =
@@ -391,6 +390,133 @@ where
extractor_allocs, extractor_allocs,
)?; )?;
{
let WordDocidsCaches {
word_docids,
word_fid_docids,
exact_word_docids,
word_position_docids,
fid_word_count_docids,
} = {
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
let _entered = span.enter();
SettingsChangeWordDocidsExtractors::run_extraction(
settings_delta,
&documents,
indexing_context,
extractor_allocs,
IndexingStep::ExtractingWords,
)?
};
indexing_context.progress.update_progress(IndexingStep::MergingWordCaches);
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(MergingWordCache::WordDocids);
merge_and_send_docids(
word_docids,
index.word_docids.remap_types(),
index,
extractor_sender.docids::<WordDocids>(),
&indexing_context.must_stop_processing,
)?;
}
{
let span =
tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids);
merge_and_send_docids(
word_fid_docids,
index.word_fid_docids.remap_types(),
index,
extractor_sender.docids::<WordFidDocids>(),
&indexing_context.must_stop_processing,
)?;
}
{
let span =
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids);
merge_and_send_docids(
exact_word_docids,
index.exact_word_docids.remap_types(),
index,
extractor_sender.docids::<ExactWordDocids>(),
&indexing_context.must_stop_processing,
)?;
}
{
let span =
tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids);
merge_and_send_docids(
word_position_docids,
index.word_position_docids.remap_types(),
index,
extractor_sender.docids::<WordPositionDocids>(),
&indexing_context.must_stop_processing,
)?;
}
{
let span =
tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids);
merge_and_send_docids(
fid_word_count_docids,
index.field_id_word_count_docids.remap_types(),
index,
extractor_sender.docids::<FidWordCountDocids>(),
&indexing_context.must_stop_processing,
)?;
}
}
// Run the proximity extraction only if the precision is ByWord.
let new_proximity_precision = settings_delta.new_proximity_precision();
if *new_proximity_precision == ProximityPrecision::ByWord {
let caches = {
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
let _entered = span.enter();
SettingsChangeWordPairProximityDocidsExtractors::run_extraction(
settings_delta,
&documents,
indexing_context,
extractor_allocs,
IndexingStep::ExtractingWordProximity,
)?
};
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
let _entered = span.enter();
indexing_context.progress.update_progress(IndexingStep::MergingWordProximity);
merge_and_send_docids(
caches,
index.word_pair_proximity_docids.remap_types(),
index,
extractor_sender.docids::<WordPairProximityDocids>(),
&indexing_context.must_stop_processing,
)?;
}
}
'vectors: { 'vectors: {
if settings_delta.embedder_actions().is_empty() { if settings_delta.embedder_actions().is_empty() {
break 'vectors; break 'vectors;

View File

@@ -1,4 +1,4 @@
use std::collections::BTreeMap; use std::collections::{BTreeMap, BTreeSet};
use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicBool;
use std::sync::{Arc, Once, RwLock}; use std::sync::{Arc, Once, RwLock};
use std::thread::{self, Builder}; use std::thread::{self, Builder};
@@ -8,9 +8,11 @@ use document_changes::{DocumentChanges, IndexingContext};
pub use document_deletion::DocumentDeletion; pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats}; pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap; use hashbrown::HashMap;
use heed::{RoTxn, RwTxn}; use heed::types::DecodeIgnore;
use heed::{BytesDecode, Database, RoTxn, RwTxn};
pub use partial_dump::PartialDump; pub use partial_dump::PartialDump;
pub use post_processing::recompute_word_fst_from_word_docids_database; pub use post_processing::recompute_word_fst_from_word_docids_database;
pub use settings_changes::settings_change_extract;
pub use update_by_function::UpdateByFunction; pub use update_by_function::UpdateByFunction;
pub use write::ChannelCongestion; pub use write::ChannelCongestion;
use write::{build_vectors, update_index, write_to_db}; use write::{build_vectors, update_index, write_to_db};
@@ -20,12 +22,18 @@ use super::steps::IndexingStep;
use super::thread_local::ThreadLocal; use super::thread_local::ThreadLocal;
use crate::documents::PrimaryKey; use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::heed_codec::StrBEU16Codec;
use crate::progress::{EmbedderStats, Progress}; use crate::progress::{EmbedderStats, Progress};
use crate::proximity::ProximityPrecision;
use crate::update::new::steps::SettingsIndexerStep;
use crate::update::new::FacetFieldIdsDelta;
use crate::update::settings::SettingsDelta; use crate::update::settings::SettingsDelta;
use crate::update::GrenadParameters; use crate::update::GrenadParameters;
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments}; use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
use crate::vector::{Embedder, RuntimeEmbedders, VectorStore}; use crate::vector::{Embedder, RuntimeEmbedders, VectorStore};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; use crate::{
Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort,
};
#[cfg(not(feature = "enterprise"))] #[cfg(not(feature = "enterprise"))]
pub mod community_edition; pub mod community_edition;
@@ -242,6 +250,20 @@ where
SD: SettingsDelta + Sync, SD: SettingsDelta + Sync,
{ {
delete_old_embedders_and_fragments(wtxn, index, settings_delta)?; delete_old_embedders_and_fragments(wtxn, index, settings_delta)?;
delete_old_fid_based_databases(wtxn, index, settings_delta, must_stop_processing, progress)?;
// Clear the word_pair_proximity_docids database when the precision switches from ByWord to ByAttribute
let old_proximity_precision = settings_delta.old_proximity_precision();
let new_proximity_precision = settings_delta.new_proximity_precision();
if *old_proximity_precision == ProximityPrecision::ByWord
&& *new_proximity_precision == ProximityPrecision::ByAttribute
{
index.word_pair_proximity_docids.clear(wtxn)?;
}
// TODO delete useless searchable databases
// - Clear fid_prefix_* in the post processing
// - clear the prefix + fid_prefix if setting `PrefixSearch` is enabled
let mut bbbuffers = Vec::new(); let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false); let finished_extraction = AtomicBool::new(false);
@@ -300,6 +322,8 @@ where
.unwrap() .unwrap()
})?; })?;
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
let new_embedders = settings_delta.new_embedders(); let new_embedders = settings_delta.new_embedders();
let embedder_actions = settings_delta.embedder_actions(); let embedder_actions = settings_delta.embedder_actions();
let index_embedder_category_ids = settings_delta.new_embedder_category_id(); let index_embedder_category_ids = settings_delta.new_embedder_category_id();
@@ -334,6 +358,18 @@ where
}) })
.unwrap()?; .unwrap()?;
pool.install(|| {
// WARN: don't forget this when implementing the facets
let facet_field_ids_delta = FacetFieldIdsDelta::new(0, 0);
post_processing::post_process(
indexing_context,
wtxn,
global_fields_ids_map,
facet_field_ids_delta,
)
})
.unwrap()?;
indexing_context.progress.update_progress(IndexingStep::BuildingGeoJson); indexing_context.progress.update_progress(IndexingStep::BuildingGeoJson);
index.cellulite.build( index.cellulite.build(
wtxn, wtxn,
@@ -463,6 +499,106 @@ where
Ok(()) Ok(())
} }
/// Deletes the entries referring to the provided
/// fids from the fid-based databases.
fn delete_old_fid_based_databases<SD, MSP>(
wtxn: &mut RwTxn<'_>,
index: &Index,
settings_delta: &SD,
must_stop_processing: &MSP,
progress: &Progress,
) -> Result<()>
where
SD: SettingsDelta + Sync,
MSP: Fn() -> bool + Sync,
{
let fids_to_delete: Option<BTreeSet<_>> = {
let rtxn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let old_searchable_attributes = settings_delta.old_searchable_attributes().as_ref();
let new_searchable_attributes = settings_delta.new_searchable_attributes().as_ref();
old_searchable_attributes.zip(new_searchable_attributes).map(|(old, new)| {
old.iter()
// Ignore the field if it is not searchable anymore
// or if it was never referenced in any document
.filter_map(|name| if new.contains(name) { None } else { fields_ids_map.id(name) })
.collect()
})
};
let Some(fids_to_delete) = fids_to_delete else {
return Ok(());
};
progress.update_progress(SettingsIndexerStep::DeletingOldWordFidDocids);
delete_old_word_fid_docids(wtxn, index.word_fid_docids, must_stop_processing, &fids_to_delete)?;
progress.update_progress(SettingsIndexerStep::DeletingOldFidWordCountDocids);
delete_old_fid_word_count_docids(wtxn, index, must_stop_processing, &fids_to_delete)?;
progress.update_progress(SettingsIndexerStep::DeletingOldWordPrefixFidDocids);
delete_old_word_fid_docids(
wtxn,
index.word_prefix_fid_docids,
must_stop_processing,
&fids_to_delete,
)?;
Ok(())
}
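The set of field ids purged above is the old searchable list minus the new one, resolved through the fields ids map. A self-contained sketch of that computation using plain standard-library types (the field names are hypothetical, and a HashMap stands in for the index's fields ids map):

use std::collections::{BTreeSet, HashMap};

// Resolve the searchable fields removed by the settings change into field ids,
// mirroring the `fids_to_delete` set computed above.
fn fids_to_delete(
    fields_ids_map: &HashMap<&str, u16>,
    old_searchable: Option<&[&str]>,
    new_searchable: Option<&[&str]>,
) -> Option<BTreeSet<u16>> {
    old_searchable.zip(new_searchable).map(|(old, new)| {
        old.iter()
            // Keep only the fields that are no longer searchable
            // and that are known to the fields ids map.
            .filter_map(|name| {
                if new.contains(name) { None } else { fields_ids_map.get(name).copied() }
            })
            .collect()
    })
}

fn main() {
    let fields_ids_map = HashMap::from([("name", 1u16), ("age", 2u16)]);
    let old = ["name", "age"];
    let new = ["name"];
    let fids = fids_to_delete(&fields_ids_map, Some(old.as_slice()), Some(new.as_slice()));
    assert_eq!(fids, Some(BTreeSet::from([2u16])));
}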
fn delete_old_word_fid_docids<'txn, MSP, DC>(
wtxn: &mut RwTxn<'txn>,
database: Database<StrBEU16Codec, DC>,
must_stop_processing: &MSP,
fids_to_delete: &BTreeSet<u16>,
) -> Result<(), Error>
where
MSP: Fn() -> bool + Sync,
DC: BytesDecode<'txn>,
{
let mut iter = database.iter_mut(wtxn)?.remap_data_type::<DecodeIgnore>();
while let Some(((_word, fid), ())) = iter.next().transpose()? {
// TODO should I call it that often?
if must_stop_processing() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
if fids_to_delete.contains(&fid) {
// safety: We don't keep any references to the data.
unsafe { iter.del_current()? };
}
}
Ok(())
}
fn delete_old_fid_word_count_docids<MSP>(
wtxn: &mut RwTxn<'_>,
index: &Index,
must_stop_processing: &MSP,
fids_to_delete: &BTreeSet<u16>,
) -> Result<(), Error>
where
MSP: Fn() -> bool + Sync,
{
let db = index.field_id_word_count_docids.remap_data_type::<DecodeIgnore>();
for &fid_to_delete in fids_to_delete {
if must_stop_processing() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
let mut iter = db.prefix_iter_mut(wtxn, &(fid_to_delete, 0))?;
while let Some(((fid, _word_count), ())) = iter.next().transpose()? {
debug_assert_eq!(fid, fid_to_delete);
// safety: We don't keep any references to the data.
unsafe { iter.del_current()? };
}
}
Ok(())
}
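The prefix-based deletion above walks every (field id, word count) key starting with a removed field id. A minimal sketch of the same idea over a BTreeMap standing in for the LMDB database; the u8 word-count key is a simplification, not the real codec:

use std::collections::BTreeMap;

// Delete every entry whose composite key starts with `fid_to_delete`.
fn delete_fid_entries(db: &mut BTreeMap<(u16, u8), u64>, fid_to_delete: u16) {
    // Collect the keys sharing the `fid_to_delete` prefix, then remove them.
    let keys: Vec<_> = db
        .range((fid_to_delete, 0u8)..=(fid_to_delete, u8::MAX))
        .map(|(k, _)| *k)
        .collect();
    for key in keys {
        db.remove(&key);
    }
}

fn main() {
    let mut db = BTreeMap::from([((1u16, 3u8), 10u64), ((1, 5), 12), ((2, 4), 7)]);
    delete_fid_entries(&mut db, 1);
    assert_eq!(db.len(), 1);
    assert!(db.contains_key(&(2, 4)));
}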
fn indexer_memory_settings( fn indexer_memory_settings(
current_num_threads: usize, current_num_threads: usize,
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,

View File

@@ -28,6 +28,9 @@ make_enum_progress! {
ChangingVectorStore, ChangingVectorStore,
UsingStableIndexer, UsingStableIndexer,
UsingExperimentalIndexer, UsingExperimentalIndexer,
DeletingOldWordFidDocids,
DeletingOldFidWordCountDocids,
DeletingOldWordPrefixFidDocids,
} }
} }

View File

@@ -1589,33 +1589,33 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// only use the new indexer when only the embedder possibly changed // only use the new indexer when only the embedder possibly changed
if let Self { if let Self {
searchable_fields: Setting::NotSet, searchable_fields: _,
displayed_fields: Setting::NotSet, displayed_fields: Setting::NotSet,
filterable_fields: Setting::NotSet, filterable_fields: Setting::NotSet,
sortable_fields: Setting::NotSet, sortable_fields: Setting::NotSet,
criteria: Setting::NotSet, criteria: Setting::NotSet,
stop_words: Setting::NotSet, stop_words: Setting::NotSet, // TODO (require force reindexing of searchables)
non_separator_tokens: Setting::NotSet, non_separator_tokens: Setting::NotSet, // TODO (require force reindexing of searchables)
separator_tokens: Setting::NotSet, separator_tokens: Setting::NotSet, // TODO (require force reindexing of searchables)
dictionary: Setting::NotSet, dictionary: Setting::NotSet, // TODO (require force reindexing of searchables)
distinct_field: Setting::NotSet, distinct_field: Setting::NotSet,
synonyms: Setting::NotSet, synonyms: Setting::NotSet,
primary_key: Setting::NotSet, primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet, authorize_typos: Setting::NotSet,
min_word_len_two_typos: Setting::NotSet, min_word_len_two_typos: Setting::NotSet,
min_word_len_one_typo: Setting::NotSet, min_word_len_one_typo: Setting::NotSet,
exact_words: Setting::NotSet, exact_words: Setting::NotSet, // TODO (require force reindexing of searchables)
exact_attributes: Setting::NotSet, exact_attributes: _,
max_values_per_facet: Setting::NotSet, max_values_per_facet: Setting::NotSet,
sort_facet_values_by: Setting::NotSet, sort_facet_values_by: Setting::NotSet,
pagination_max_total_hits: Setting::NotSet, pagination_max_total_hits: Setting::NotSet,
proximity_precision: Setting::NotSet, proximity_precision: _,
embedder_settings: _, embedder_settings: _,
search_cutoff: Setting::NotSet, search_cutoff: Setting::NotSet,
localized_attributes_rules: Setting::NotSet, localized_attributes_rules: Setting::NotSet, // TODO to start with
prefix_search: Setting::NotSet, prefix_search: Setting::NotSet, // TODO continue with this
facet_search: Setting::NotSet, facet_search: Setting::NotSet,
disable_on_numbers: Setting::NotSet, disable_on_numbers: Setting::NotSet, // TODO (require force reindexing of searchables)
chat: Setting::NotSet, chat: Setting::NotSet,
vector_store: Setting::NotSet, vector_store: Setting::NotSet,
wtxn: _, wtxn: _,
@@ -1632,10 +1632,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// Update index settings // Update index settings
let embedding_config_updates = self.update_embedding_configs()?; let embedding_config_updates = self.update_embedding_configs()?;
self.update_user_defined_searchable_attributes()?; self.update_user_defined_searchable_attributes()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
let mut new_inner_settings = // Note that we don't need to update the searchables here,
InnerIndexSettings::from_index(self.index, self.wtxn, None)?; // as it will be done after the settings update.
new_inner_settings.recompute_searchables(self.wtxn, self.index)?; let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
let primary_key_id = self let primary_key_id = self
.index .index
@@ -2062,9 +2064,12 @@ impl InnerIndexSettings {
let sortable_fields = index.sortable_fields(rtxn)?; let sortable_fields = index.sortable_fields(rtxn)?;
let asc_desc_fields = index.asc_desc_fields(rtxn)?; let asc_desc_fields = index.asc_desc_fields(rtxn)?;
let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string()); let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string());
let user_defined_searchable_attributes = index let user_defined_searchable_attributes = match index.user_defined_searchable_fields(rtxn)? {
.user_defined_searchable_fields(rtxn)? Some(fields) if fields.contains(&"*") => None,
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); Some(fields) => Some(fields.into_iter().map(|f| f.to_string()).collect()),
None => None,
};
let builder = MetadataBuilder::from_index(index, rtxn)?; let builder = MetadataBuilder::from_index(index, rtxn)?;
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?; let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
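The `user_defined_searchable_attributes` change above normalizes a list containing the "*" wildcard to `None`, i.e. "every field is searchable". A small sketch of that normalization in isolation:

// Sketch of the wildcard normalization: a searchable list containing "*"
// is stored as `None`, meaning all fields are searchable.
fn normalize_searchable(fields: Option<Vec<&str>>) -> Option<Vec<String>> {
    match fields {
        Some(fields) if fields.contains(&"*") => None,
        Some(fields) => Some(fields.into_iter().map(|f| f.to_string()).collect()),
        None => None,
    }
}

fn main() {
    assert_eq!(normalize_searchable(Some(vec!["*", "name"])), None);
    assert_eq!(
        normalize_searchable(Some(vec!["name", "age"])),
        Some(vec!["name".to_string(), "age".to_string()])
    );
    assert_eq!(normalize_searchable(None), None);
}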
@@ -2578,8 +2583,20 @@ fn deserialize_sub_embedder(
/// Implement this trait for the settings delta type. /// Implement this trait for the settings delta type.
/// This is used in the new settings update flow and will allow to easily replace the old settings delta type: `InnerIndexSettingsDiff`. /// This is used in the new settings update flow and will allow to easily replace the old settings delta type: `InnerIndexSettingsDiff`.
pub trait SettingsDelta { pub trait SettingsDelta {
fn new_embedders(&self) -> &RuntimeEmbedders; fn old_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
fn old_disabled_typos_terms(&self) -> &DisabledTyposTerms;
fn new_disabled_typos_terms(&self) -> &DisabledTyposTerms;
fn old_proximity_precision(&self) -> &ProximityPrecision;
fn new_proximity_precision(&self) -> &ProximityPrecision;
fn old_embedders(&self) -> &RuntimeEmbedders; fn old_embedders(&self) -> &RuntimeEmbedders;
fn new_embedders(&self) -> &RuntimeEmbedders;
fn new_embedder_category_id(&self) -> &HashMap<String, u8>; fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>; fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
fn try_for_each_fragment_diff<F, E>( fn try_for_each_fragment_diff<F, E>(
@@ -2589,7 +2606,6 @@ pub trait SettingsDelta {
) -> std::result::Result<(), E> ) -> std::result::Result<(), E>
where where
F: FnMut(FragmentDiff) -> std::result::Result<(), E>; F: FnMut(FragmentDiff) -> std::result::Result<(), E>;
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
} }
pub struct FragmentDiff<'a> { pub struct FragmentDiff<'a> {
@@ -2598,26 +2614,47 @@ pub struct FragmentDiff<'a> {
} }
impl SettingsDelta for InnerIndexSettingsDiff { impl SettingsDelta for InnerIndexSettingsDiff {
fn new_embedders(&self) -> &RuntimeEmbedders { fn old_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.runtime_embedders &self.old.fields_ids_map
}
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn old_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.old.user_defined_searchable_attributes
}
fn new_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.new.user_defined_searchable_attributes
}
fn old_disabled_typos_terms(&self) -> &DisabledTyposTerms {
&self.old.disabled_typos_terms
}
fn new_disabled_typos_terms(&self) -> &DisabledTyposTerms {
&self.new.disabled_typos_terms
}
fn old_proximity_precision(&self) -> &ProximityPrecision {
&self.old.proximity_precision
}
fn new_proximity_precision(&self) -> &ProximityPrecision {
&self.new.proximity_precision
} }
fn old_embedders(&self) -> &RuntimeEmbedders { fn old_embedders(&self) -> &RuntimeEmbedders {
&self.old.runtime_embedders &self.old.runtime_embedders
} }
fn new_embedders(&self) -> &RuntimeEmbedders {
&self.new.runtime_embedders
}
fn new_embedder_category_id(&self) -> &HashMap<String, u8> { fn new_embedder_category_id(&self) -> &HashMap<String, u8> {
&self.new.embedder_category_id &self.new.embedder_category_id
} }
fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction> { fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction> {
&self.embedding_config_updates &self.embedding_config_updates
} }
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn try_for_each_fragment_diff<F, E>( fn try_for_each_fragment_diff<F, E>(
&self, &self,
embedder_name: &str, embedder_name: &str,

View File

@@ -14,28 +14,21 @@ fn set_and_reset_searchable_fields() {
let index = TempIndex::new(); let index = TempIndex::new();
// First we send 3 documents with ids from 1 to 3. // First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
index index
.add_documents_using_wtxn( .add_documents(documents!([
&mut wtxn, { "id": 1, "name": "kevin", "age": 23 },
documents!([ { "id": 2, "name": "kevina", "age": 21},
{ "id": 1, "name": "kevin", "age": 23 }, { "id": 3, "name": "benoit", "age": 34 }
{ "id": 2, "name": "kevina", "age": 21}, ]))
{ "id": 3, "name": "benoit", "age": 34 }
]),
)
.unwrap(); .unwrap();
// We change the searchable fields to be the "name" field only. // We change the searchable fields to be the "name" field only.
index index
.update_settings_using_wtxn(&mut wtxn, |settings| { .update_settings(|settings| {
settings.set_searchable_fields(vec!["name".into()]); settings.set_searchable_fields(vec!["name".into()]);
}) })
.unwrap(); .unwrap();
wtxn.commit().unwrap();
db_snap!(index, fields_ids_map, @r###" db_snap!(index, fields_ids_map, @r###"
0 id | 0 id |
1 name | 1 name |