Compare commits

...

23 Commits

Author SHA1 Message Date
Clément Renault
80d7aa7bdc Use a patch version of arroy to disable simd 2025-06-26 16:54:18 +02:00
Louis Dureuil
94b43001db Merge pull request #5492 from meilisearch/accept-cancelation-tasks-when-disk-full
make meilisearch accept cancelation tasks even when the disk is full
2025-04-03 15:46:46 +00:00
Tamo
796a325972 Fix typos
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-04-03 15:53:42 +02:00
Tamo
1db550ec7f make meilisearch accept cancelation tasks even when the disk is full 2025-04-03 15:47:56 +02:00
Louis Dureuil
418fa47963 Merge pull request #5313 from barloes/fixRankingScoreThresholdRankingIssue
fix for rankingScoreThreshold changes the results' ranking
2025-04-01 13:10:55 +00:00
Louis Dureuil
0656a0d515 Optimize roaring operation
Co-authored-by: Many the fish <many@meilisearch.com>
2025-04-01 14:25:27 +02:00
Tamo
e36a8c50b9 Merge pull request #5478 from meilisearch/enforce-embedding-dimensions
Enforce embedding dimensions
2025-03-31 15:31:29 +00:00
Louis Dureuil
08ff135ad6 Fix test 2025-03-31 15:27:49 +02:00
Louis Dureuil
f729864466 Check dimension mismatch at insertion time 2025-03-31 15:27:49 +02:00
Louis Dureuil
94ea263bef Add new error for dimensions mismatch during indexing 2025-03-31 15:27:49 +02:00
Tamo
0e475cb5e6 fix warn and show what meilisearch understood of the vectors in the cursed test 2025-03-31 13:49:22 +02:00
vuthanhtung2412
62de70b73c Document problematic case in test and acknowledge PR comment 2025-03-31 13:49:22 +02:00
vuthanhtung2412
7707fb18dd add embedding with dimension mismatch test case 2025-03-31 13:49:22 +02:00
Clément Renault
bb2e9419d3 Merge pull request #5468 from meilisearch/more-precise-post-processing
More Precise Post Processing
2025-03-27 10:07:09 +00:00
Clément Renault
cf68713145 Merge pull request #5465 from meilisearch/improve-stats-perf
Improve documents stats performances
2025-03-27 09:20:14 +00:00
Kerollmops
811143cbe9 Add more progress precision when doing post processing 2025-03-27 10:17:28 +01:00
Kerollmops
c670e9a39b Make sure the snaps are happy 2025-03-26 20:03:35 +01:00
Clément Renault
65f1b13475 Merge pull request #5464 from meilisearch/camel-case-database-sizes
Prefer camelCase for internal database sizes db name
2025-03-26 16:40:39 +00:00
Kerollmops
db7ce03763 Improve the performances of computing the size of the documents database 2025-03-26 17:40:12 +01:00
Kerollmops
7ed9adde29 Prefer camelCase for internal database sizes db name 2025-03-26 16:45:52 +01:00
Louis Dureuil
f9807ba32e Fix logic when results are below the threshold 2025-03-19 11:34:53 +01:00
Tee Jun hui
8c8cc59a6c remove new line added by accident 2025-03-19 11:34:53 +01:00
Tee Jun hui
f540a69ac3 add 1 to index so it points to correct position 2025-03-19 11:34:52 +01:00
23 changed files with 452 additions and 217 deletions

5
Cargo.lock generated
View File

@@ -1,6 +1,6 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
version = 4
[[package]]
name = "actix-codec"
@@ -394,8 +394,7 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arroy"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
source = "git+https://github.com/meilisearch/arroy?branch=no-simd-x86-arroy#2ebbed058a6e3292707486e5a57d754d94f3fa2a"
dependencies = [
"bytemuck",
"byteorder",

View File

@@ -625,8 +625,8 @@ impl IndexScheduler {
task_id: Option<TaskId>,
dry_run: bool,
) -> Result<Task> {
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
// if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty())
&& (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40
{
return Err(Error::NoSpaceLeftInTaskQueue);

View File

@@ -292,8 +292,6 @@ impl Queue {
return Ok(task);
}
// Get rid of the mutability.
let task = task;
self.tasks.register(wtxn, &task)?;
Ok(task)

View File

@@ -364,7 +364,7 @@ fn test_task_queue_is_full() {
// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");
// Even the task deletion that doesn't delete anything shouldn't be accepted
// Even the task deletion and cancelation that don't delete anything should be refused
let result = index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() },
@@ -373,10 +373,39 @@ fn test_task_queue_is_full() {
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
let result = index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");
// But a task deletion that delete something should works
// But a task cancelation that cancel something should work
index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() },
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
// But we should still be forbidden from enqueuing new tasks
let result = index_scheduler
.register(
KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
// And a task deletion that delete something should works
index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() },

View File

@@ -20,6 +20,7 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::Arc;
use convert_case::{Case, Casing as _};
use meilisearch_types::error::ResponseError;
use meilisearch_types::heed::{Env, WithoutTls};
use meilisearch_types::milli;
@@ -381,7 +382,10 @@ impl IndexScheduler {
Less => "-",
};
Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into()))
Some((
dbname.to_case(Case::Camel),
format!("{post:#.2} ({sign}{diff:#.2})").into(),
))
})
.into_iter()
.flatten()

View File

@@ -454,7 +454,10 @@ impl ErrorCode for milli::Error {
}
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorDimensions { .. }
| UserError::InvalidIndexingVectorDimensions { .. } => {
Code::InvalidVectorDimensions
}
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,

View File

@@ -518,7 +518,7 @@ impl From<index_scheduler::IndexStats> for IndexStats {
.inner_stats
.number_of_documents
.unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()),
raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
raw_document_db_size: stats.inner_stats.documents_database_stats.total_size(),
avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
is_indexing: stats.is_indexing,
number_of_embeddings: stats.inner_stats.number_of_embeddings,

View File

@@ -157,11 +157,14 @@ async fn delete_document_by_filter() {
index.wait_task(task.uid()).await.succeeded();
let (stats, _) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 4,
"rawDocumentDbSize": 42,
"avgDocumentSize": 10,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -208,11 +211,14 @@ async fn delete_document_by_filter() {
"###);
let (stats, _) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 16,
"avgDocumentSize": 8,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -278,11 +284,14 @@ async fn delete_document_by_filter() {
"###);
let (stats, _) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 12,
"avgDocumentSize": 12,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -28,12 +28,15 @@ async fn import_dump_v1_movie_raw() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -185,12 +188,15 @@ async fn import_dump_v1_movie_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -355,12 +361,15 @@ async fn import_dump_v1_rubygems_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -522,12 +531,15 @@ async fn import_dump_v2_movie_raw() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -679,12 +691,15 @@ async fn import_dump_v2_movie_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -846,12 +861,15 @@ async fn import_dump_v2_rubygems_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1010,12 +1028,15 @@ async fn import_dump_v3_movie_raw() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1167,12 +1188,15 @@ async fn import_dump_v3_movie_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1334,12 +1358,15 @@ async fn import_dump_v3_rubygems_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1498,12 +1525,15 @@ async fn import_dump_v4_movie_raw() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1655,12 +1685,15 @@ async fn import_dump_v4_movie_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1822,12 +1855,15 @@ async fn import_dump_v4_rubygems_with_settings() {
let (stats, code) = index.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1994,11 +2030,14 @@ async fn import_dump_v5() {
let (stats, code) = index1.stats().await;
snapshot!(code, @"200 OK");
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 10,
"rawDocumentDbSize": 6782,
"avgDocumentSize": 678,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -2031,12 +2070,15 @@ async fn import_dump_v5() {
let (stats, code) = index2.stats().await;
snapshot!(code, @"200 OK");
snapshot!(
json_string!(stats),
json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}),
@r###"
{
"numberOfDocuments": 10,
"rawDocumentDbSize": 6782,
"avgDocumentSize": 678,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -110,11 +110,14 @@ async fn add_remove_embeddings() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 5,
"numberOfEmbeddedDocuments": 2,
@@ -135,11 +138,14 @@ async fn add_remove_embeddings() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 2,
@@ -160,11 +166,14 @@ async fn add_remove_embeddings() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 2,
"numberOfEmbeddedDocuments": 2,
@@ -186,11 +195,14 @@ async fn add_remove_embeddings() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 2,
"numberOfEmbeddedDocuments": 1,
@@ -236,11 +248,14 @@ async fn add_remove_embedded_documents() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 5,
"numberOfEmbeddedDocuments": 2,
@@ -257,11 +272,14 @@ async fn add_remove_embedded_documents() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 13,
"avgDocumentSize": 13,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 1,
@@ -290,11 +308,14 @@ async fn update_embedder_settings() {
index.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 108,
"avgDocumentSize": 54,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -326,11 +347,14 @@ async fn update_embedder_settings() {
server.wait_task(response.uid()).await.succeeded();
let (stats, _code) = index.stats().await;
snapshot!(json_string!(stats), @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[size]",
".avgDocumentSize" => "[size]",
}), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 108,
"avgDocumentSize": 54,
"rawDocumentDbSize": "[size]",
"avgDocumentSize": "[size]",
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 2,

View File

@@ -133,7 +133,9 @@ async fn check_the_index_scheduler(server: &Server) {
let (stats, _) = server.stats().await;
assert_json_snapshot!(stats, {
".databaseSize" => "[bytes]",
".usedDatabaseSize" => "[bytes]"
".usedDatabaseSize" => "[bytes]",
".indexes.kefir.rawDocumentDbSize" => "[bytes]",
".indexes.kefir.avgDocumentSize" => "[bytes]",
},
@r###"
{
@@ -143,8 +145,8 @@ async fn check_the_index_scheduler(server: &Server) {
"indexes": {
"kefir": {
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"rawDocumentDbSize": "[bytes]",
"avgDocumentSize": "[bytes]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -217,7 +219,9 @@ async fn check_the_index_scheduler(server: &Server) {
let (stats, _) = server.stats().await;
assert_json_snapshot!(stats, {
".databaseSize" => "[bytes]",
".usedDatabaseSize" => "[bytes]"
".usedDatabaseSize" => "[bytes]",
".indexes.kefir.rawDocumentDbSize" => "[bytes]",
".indexes.kefir.avgDocumentSize" => "[bytes]",
},
@r###"
{
@@ -227,8 +231,8 @@ async fn check_the_index_scheduler(server: &Server) {
"indexes": {
"kefir": {
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"rawDocumentDbSize": "[bytes]",
"avgDocumentSize": "[bytes]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -245,11 +249,14 @@ async fn check_the_index_scheduler(server: &Server) {
"###);
let index = server.index("kefir");
let (stats, _) = index.stats().await;
snapshot!(stats, @r###"
snapshot!(json_string!(stats, {
".rawDocumentDbSize" => "[bytes]",
".avgDocumentSize" => "[bytes]",
}), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"rawDocumentDbSize": "[bytes]",
"avgDocumentSize": "[bytes]",
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -164,6 +164,87 @@ async fn add_remove_user_provided() {
"###);
}
#[actix_rt::test]
async fn user_provide_mismatched_embedding_dimension() {
let server = Server::new().await;
let index = server.index("doggo");
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await.succeeded();
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let new_document = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
]);
let (response, code) = index.add_documents(new_document, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");

View File

@@ -87,7 +87,7 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838
"no_time",
"sync",
] }
arroy = "0.6.1"
arroy = { git = "https://github.com/meilisearch/arroy", branch = "no-simd-x86-arroy" }
rand = "0.8.5"
tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] }

View File

@@ -1,8 +1,13 @@
use heed::types::Bytes;
use std::mem;
use heed::Database;
use heed::DatabaseStat;
use heed::RoTxn;
use heed::Unspecified;
use serde::{Deserialize, Serialize};
use crate::BEU32;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
/// The stats of a database.
@@ -20,58 +25,24 @@ impl DatabaseStats {
///
/// This function iterates over the whole database and computes the stats.
/// It is not efficient and should be cached somewhere.
pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> {
let mut database_stats =
Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 };
pub(crate) fn new(
database: Database<BEU32, Unspecified>,
rtxn: &RoTxn<'_>,
) -> heed::Result<Self> {
let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } =
database.stat(rtxn)?;
let mut iter = database.iter(rtxn)?;
while let Some((key, value)) = iter.next().transpose()? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
database_stats.total_key_size += key_size;
database_stats.total_value_size += value_size;
}
// We first take the total size without overflow pages as the overflow pages contains the values and only that.
let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize;
// We compute an estimated size for the keys.
let total_key_size = entries * (mem::size_of::<u32>() + 4);
let total_value_size = total_size - total_key_size;
database_stats.number_of_entries = database.len(rtxn)?;
Ok(database_stats)
}
/// Recomputes the stats of the database and returns the new stats.
///
/// This function is used to update the stats of the database when some keys are modified.
/// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states.
pub(crate) fn recompute<I, K>(
mut stats: Self,
database: Database<Bytes, Bytes>,
before_rtxn: &RoTxn<'_>,
after_rtxn: &RoTxn<'_>,
modified_keys: I,
) -> heed::Result<Self>
where
I: IntoIterator<Item = K>,
K: AsRef<[u8]>,
{
for key in modified_keys {
let key = key.as_ref();
if let Some(value) = database.get(after_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_add(key_size);
stats.total_value_size = stats.total_value_size.saturating_add(value_size);
}
if let Some(value) = database.get(before_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_sub(key_size);
stats.total_value_size = stats.total_value_size.saturating_sub(value_size);
}
}
stats.number_of_entries = database.len(after_rtxn)?;
Ok(stats)
Ok(Self {
number_of_entries: entries as u64,
total_key_size: total_key_size as u64,
total_value_size: total_value_size as u64,
})
}
pub fn average_key_size(&self) -> u64 {
@@ -86,6 +57,10 @@ impl DatabaseStats {
self.number_of_entries
}
pub fn total_size(&self) -> u64 {
self.total_key_size + self.total_value_size
}
pub fn total_key_size(&self) -> u64 {
self.total_key_size
}

View File

@@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
InvalidIndexingVectorDimensions {
embedder_name: String,
document_id: String,
embedding_index: usize,
expected: usize,
found: usize,
},
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]

View File

@@ -411,38 +411,6 @@ impl Index {
Ok(count.unwrap_or_default())
}
/// Updates the stats of the documents database based on the previous stats and the modified docids.
pub fn update_documents_stats(
&self,
wtxn: &mut RwTxn<'_>,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
let before_rtxn = self.read_txn()?;
let document_stats = match self.documents_stats(&before_rtxn)? {
Some(before_stats) => DatabaseStats::recompute(
before_stats,
self.documents.remap_types(),
&before_rtxn,
wtxn,
modified_docids.iter().map(|docid| docid.to_be_bytes()),
)?,
None => {
// This should never happen when there are already documents in the index, the documents stats should be present.
// If it happens, it means that the index was not properly initialized/upgraded.
debug_assert_eq!(
self.documents.len(&before_rtxn)?,
0,
"The documents stats should be present when there are documents in the index"
);
tracing::warn!("No documents stats found, creating new ones");
DatabaseStats::new(self.documents.remap_types(), &*wtxn)?
}
};
self.put_documents_stats(wtxn, document_stats)?;
Ok(())
}
/// Writes the stats of the documents database.
pub fn put_documents_stats(
&self,

View File

@@ -173,16 +173,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ranking_rule_scores.push(ScoreDetails::Skipped);
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
let is_below_threshold =
ranking_score_threshold.is_some_and(|ranking_score_threshold| {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});
maybe_add_to_results!(bucket);
if is_below_threshold {
all_candidates -= &bucket;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(bucket);
}
ranking_rule_scores.pop();
@@ -237,23 +239,24 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
);
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
current_score < ranking_score_threshold
});
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
if cur_ranking_rule_index == ranking_rules_len - 1
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|| is_below_threshold
{
maybe_add_to_results!(next_bucket.candidates);
if is_below_threshold {
all_candidates -= &next_bucket.candidates;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(next_bucket.candidates);
}
ranking_rule_scores.pop();
continue;
}

View File

@@ -28,6 +28,7 @@ pub use self::helpers::*;
pub use self::transform::{Transform, TransformOutput};
use super::facet::clear_facet_levels_based_on_settings_diff;
use super::new::StdResult;
use crate::database_stats::DatabaseStats;
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError};
use crate::index::{PrefixSearch, PrefixSettings};
@@ -476,7 +477,8 @@ where
if !settings_diff.settings_update_only {
// Update the stats of the documents database when there is a document update.
self.index.update_documents_stats(self.wtxn, modified_docids)?;
let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?;
self.index.put_documents_stats(self.wtxn, stats)?;
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;

View File

@@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
// do we have set embeddings?
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
update.external_document_id(),
update.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)
@@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
document_id: update.external_document_id().to_string(),
error: error.to_string(),
})?,
);
)?;
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),
@@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
insertion.external_document_id(),
insertion.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)
@@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
.to_string(),
error: error.to_string(),
})?,
);
)?;
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),
@@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
embedder: &'a Embedder,
embedder_id: u8,
embedder_name: &'a str,
dimensions: usize,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
@@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
let texts = BVec::with_capacity_in(capacity, doc_alloc);
let ids = BVec::with_capacity_in(capacity, doc_alloc);
let dimensions = embedder.dimensions();
Self {
texts,
ids,
@@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
embedder_name,
user_provided,
has_manual_generation: None,
dimensions,
}
}
@@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
}
}
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
fn set_vectors(
&self,
external_docid: &'a str,
docid: DocumentId,
embeddings: Vec<Embedding>,
) -> Result<()> {
for (embedding_index, embedding) in embeddings.iter().enumerate() {
if embedding.len() != self.dimensions {
return Err(UserError::InvalidIndexingVectorDimensions {
expected: self.dimensions,
found: embedding.len(),
embedder_name: self.embedder_name.to_string(),
document_id: external_docid.to_string(),
embedding_index,
}
.into());
}
}
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
Ok(())
}
}

View File

@@ -234,7 +234,6 @@ where
embedders,
field_distribution,
document_ids,
modified_docids,
)?;
Ok(congestion)

View File

@@ -7,12 +7,13 @@ use itertools::{merge_join_by, EitherOrBoth};
use super::document_changes::IndexingContext;
use crate::facet::FacetType;
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
use crate::progress::Progress;
use crate::update::del_add::DelAdd;
use crate::update::facet::new_incremental::FacetsUpdateIncremental;
use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::update::new::facet_search_builder::FacetSearchBuilder;
use crate::update::new::merger::FacetFieldIdDelta;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::{IndexingStep, PostProcessingFacets, PostProcessingWords};
use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
use crate::update::new::words_prefix_docids::{
compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids,
@@ -33,11 +34,23 @@ where
{
let index = indexing_context.index;
indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?;
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
compute_facet_level_database(
index,
wtxn,
facet_field_ids_delta,
&mut global_fields_ids_map,
indexing_context.progress,
)?;
compute_facet_search_database(index, wtxn, global_fields_ids_map, indexing_context.progress)?;
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?;
if let Some(prefix_delta) = compute_word_fst(index, wtxn, indexing_context.progress)? {
compute_prefix_database(
index,
wtxn,
prefix_delta,
indexing_context.grenad_parameters,
indexing_context.progress,
)?;
};
Ok(())
}
@@ -48,21 +61,32 @@ fn compute_prefix_database(
wtxn: &mut RwTxn,
prefix_delta: PrefixDelta,
grenad_parameters: &GrenadParameters,
progress: &Progress,
) -> Result<()> {
let PrefixDelta { modified, deleted } = prefix_delta;
// Compute word prefix docids
progress.update_progress(PostProcessingWords::WordPrefixDocids);
compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
// Compute exact word prefix docids
progress.update_progress(PostProcessingWords::ExactWordPrefixDocids);
compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
// Compute word prefix fid docids
progress.update_progress(PostProcessingWords::WordPrefixFieldIdDocids);
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
// Compute word prefix position docids
progress.update_progress(PostProcessingWords::WordPrefixPositionDocids);
compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing")]
fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result<Option<PrefixDelta>> {
fn compute_word_fst(
index: &Index,
wtxn: &mut RwTxn,
progress: &Progress,
) -> Result<Option<PrefixDelta>> {
let rtxn = index.read_txn()?;
progress.update_progress(PostProcessingWords::WordFst);
let words_fst = index.words_fst(&rtxn)?;
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
let prefix_settings = index.prefix_settings(&rtxn)?;
@@ -112,8 +136,10 @@ fn compute_facet_search_database(
index: &Index,
wtxn: &mut RwTxn,
global_fields_ids_map: GlobalFieldsIdsMap,
progress: &Progress,
) -> Result<()> {
let rtxn = index.read_txn()?;
progress.update_progress(PostProcessingFacets::FacetSearch);
// if the facet search is not enabled, we can skip the rest of the function
if !index.facet_search(wtxn)? {
@@ -171,10 +197,16 @@ fn compute_facet_level_database(
wtxn: &mut RwTxn,
mut facet_field_ids_delta: FacetFieldIdsDelta,
global_fields_ids_map: &mut GlobalFieldsIdsMap,
progress: &Progress,
) -> Result<()> {
let rtxn = index.read_txn()?;
let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?;
for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() {
let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_string_delta().collect();
// We move all bulks at the front and incrementals (others) at the end.
deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
for (fid, delta) in deltas {
// skip field ids that should not be facet leveled
let Some(metadata) = global_fields_ids_map.metadata(fid) else {
continue;
@@ -187,11 +219,13 @@ fn compute_facet_level_database(
let _entered = span.enter();
match delta {
FacetFieldIdDelta::Bulk => {
progress.update_progress(PostProcessingFacets::StringsBulk);
tracing::debug!(%fid, "bulk string facet processing");
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String)
.execute(wtxn)?
}
FacetFieldIdDelta::Incremental(delta_data) => {
progress.update_progress(PostProcessingFacets::StringsIncremental);
tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing");
FacetsUpdateIncremental::new(
index,
@@ -207,16 +241,22 @@ fn compute_facet_level_database(
}
}
for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() {
let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_number_delta().collect();
// We move all bulks at the front and incrementals (others) at the end.
deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
for (fid, delta) in deltas {
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
let _entered = span.enter();
match delta {
FacetFieldIdDelta::Bulk => {
progress.update_progress(PostProcessingFacets::NumbersBulk);
tracing::debug!(%fid, "bulk number facet processing");
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number)
.execute(wtxn)?
}
FacetFieldIdDelta::Incremental(delta_data) => {
progress.update_progress(PostProcessingFacets::NumbersIncremental);
tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing");
FacetsUpdateIncremental::new(
index,

View File

@@ -7,6 +7,7 @@ use rand::SeedableRng as _;
use time::OffsetDateTime;
use super::super::channel::*;
use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
@@ -142,7 +143,6 @@ pub(super) fn update_index(
embedders: EmbeddingConfigs,
field_distribution: std::collections::BTreeMap<String, u64>,
document_ids: roaring::RoaringBitmap,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
if let Some(new_primary_key) = new_primary_key {
@@ -153,7 +153,8 @@ pub(super) fn update_index(
index.put_field_distribution(wtxn, &field_distribution)?;
index.put_documents_ids(wtxn, &document_ids)?;
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
index.update_documents_stats(wtxn, modified_docids)?;
let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?;
index.put_documents_stats(wtxn, stats)?;
Ok(())
}

View File

@@ -20,3 +20,23 @@ make_enum_progress! {
Finalizing,
}
}
make_enum_progress! {
pub enum PostProcessingFacets {
StringsBulk,
StringsIncremental,
NumbersBulk,
NumbersIncremental,
FacetSearch,
}
}
make_enum_progress! {
pub enum PostProcessingWords {
WordFst,
WordPrefixDocids,
ExactWordPrefixDocids,
WordPrefixFieldIdDocids,
WordPrefixPositionDocids,
}
}