Merge remote-tracking branch 'origin/release-v1.16.0' into fix-threshold-overcounting-bug

This commit is contained in:
Mubelotix
2025-07-15 18:01:29 +02:00
32 changed files with 1491 additions and 998 deletions

View File

@ -554,10 +554,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
match self.searchable_fields {
Setting::Set(ref fields) => {
// Check to see if the searchable fields changed before doing anything else
let old_fields = self.index.searchable_fields(self.wtxn)?;
let old_fields = self.index.user_defined_searchable_fields(self.wtxn)?;
let did_change = {
let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
new_fields != old_fields
old_fields.is_none_or(|old| new_fields != old)
};
if !did_change {
return Ok(false);

View File

@ -2,6 +2,7 @@ mod v1_12;
mod v1_13;
mod v1_14;
mod v1_15;
mod v1_16;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
@ -10,6 +11,7 @@ use v1_15::Latest_V1_14_To_Latest_V1_15;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0;
use crate::{Index, InternalError, Result};
trait UpgradeIndex {
@ -24,6 +26,59 @@ trait UpgradeIndex {
fn target_version(&self) -> (u32, u32, u32);
}
const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[
&V1_12_To_V1_12_3 {},
&V1_12_3_To_V1_13_0 {},
&V1_13_0_To_V1_13_1 {},
&V1_13_1_To_Latest_V1_13 {},
&Latest_V1_13_To_Latest_V1_14 {},
&Latest_V1_14_To_Latest_V1_15 {},
&Latest_V1_15_To_V1_16_0 {},
// This is the last upgrade function, it will be called when the index is up to date.
// any other upgrade function should be added before this one.
&ToCurrentNoOp {},
];
/// Causes a compile-time error if the argument is not in range of `0..UPGRADE_FUNCTIONS.len()`
macro_rules! function_index {
($start:expr) => {{
const _CHECK_INDEX: () = {
if $start >= $crate::update::upgrade::UPGRADE_FUNCTIONS.len() {
panic!("upgrade functions out of range")
}
};
$start
}};
}
const fn start(from: (u32, u32, u32)) -> Option<usize> {
let start = match from {
(1, 12, 0..=2) => function_index!(0),
(1, 12, 3..) => function_index!(1),
(1, 13, 0) => function_index!(2),
(1, 13, _) => function_index!(4),
(1, 14, _) => function_index!(5),
// We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
(1, 15, _) => function_index!(6),
(1, 16, _) => function_index!(7),
// We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually
// considering dumpless upgrade.
(_major, _minor, _patch) => return None,
};
Some(start)
}
/// Causes a compile-time error if the latest package cannot be upgraded.
///
/// This serves as a reminder to consider the proper dumpless upgrade implementation when changing the package version.
const _CHECK_PACKAGE_CAN_UPGRADE: () = {
if start((VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)).is_none() {
panic!("cannot upgrade from latest package version")
}
};
/// Return true if the cached stats of the index must be regenerated
pub fn upgrade<MSP>(
wtxn: &mut RwTxn,
@ -36,33 +91,12 @@ where
MSP: Fn() -> bool + Sync,
{
let from = index.get_version(wtxn)?.unwrap_or(db_version);
let upgrade_functions: &[&dyn UpgradeIndex] = &[
&V1_12_To_V1_12_3 {},
&V1_12_3_To_V1_13_0 {},
&V1_13_0_To_V1_13_1 {},
&V1_13_1_To_Latest_V1_13 {},
&Latest_V1_13_To_Latest_V1_14 {},
&Latest_V1_14_To_Latest_V1_15 {},
// This is the last upgrade function, it will be called when the index is up to date.
// any other upgrade function should be added before this one.
&ToCurrentNoOp {},
];
let start = match from {
(1, 12, 0..=2) => 0,
(1, 12, 3..) => 1,
(1, 13, 0) => 2,
(1, 13, _) => 4,
(1, 14, _) => 5,
// We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
(1, 15, _) => 6,
(major, minor, patch) => {
return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
}
};
let start =
start(from).ok_or_else(|| InternalError::CannotUpgradeToVersion(from.0, from.1, from.2))?;
enum UpgradeVersion {}
let upgrade_path = &upgrade_functions[start..];
let upgrade_path = &UPGRADE_FUNCTIONS[start..];
let mut current_version = from;
let mut regenerate_stats = false;

View File

@ -1,4 +1,6 @@
use heed::RwTxn;
use roaring::RoaringBitmap;
use serde::Deserialize;
use super::UpgradeIndex;
use crate::progress::Progress;
@ -26,3 +28,14 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
(1, 15, 0)
}
}
/// Parts of v1.15 `IndexingEmbeddingConfig` that are relevant for upgrade to v1.16
///
/// # Warning
///
/// This object should not be rewritten to the DB, only read to get the name and `user_provided` roaring.
#[derive(Debug, Deserialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub user_provided: RoaringBitmap,
}

View File

@ -0,0 +1,48 @@
use heed::types::{SerdeJson, Str};
use heed::RwTxn;
use super::UpgradeIndex;
use crate::progress::Progress;
use crate::vector::db::{EmbedderInfo, EmbeddingStatus};
use crate::{Index, InternalError, Result};
#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_15_To_V1_16_0();
impl UpgradeIndex for Latest_V1_15_To_V1_16_0 {
fn upgrade(
&self,
wtxn: &mut RwTxn,
index: &Index,
_original: (u32, u32, u32),
_progress: Progress,
) -> Result<bool> {
let v1_15_indexing_configs = index
.main
.remap_types::<Str, SerdeJson<Vec<super::v1_15::IndexEmbeddingConfig>>>()
.get(wtxn, crate::index::main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default();
let embedders = index.embedding_configs();
for config in v1_15_indexing_configs {
let embedder_id = embedders.embedder_id(wtxn, &config.name)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
let info = EmbedderInfo {
embedder_id,
// v1.15 used not to make a difference between `user_provided` and `! regenerate`.
embedding_status: EmbeddingStatus::from_user_provided(config.user_provided),
};
embedders.put_embedder_info(wtxn, &config.name, &info)?;
}
Ok(false)
}
fn target_version(&self) -> (u32, u32, u32) {
(1, 16, 0)
}
}

View File

@ -117,6 +117,13 @@ impl EmbeddingStatus {
Default::default()
}
/// Create a new `EmbeddingStatus` that assumes that any `user_provided` docid is also skipping regenerate.
///
/// Used for migration from v1.15 and earlier DBs.
pub(crate) fn from_user_provided(user_provided: RoaringBitmap) -> Self {
Self { user_provided, skip_regenerate_different_from_user_provided: Default::default() }
}
/// Whether the document contains user-provided vectors for that embedder.
pub fn is_user_provided(&self, docid: DocumentId) -> bool {
self.user_provided.contains(docid)