First better working version

Kerollmops
2025-11-13 12:31:07 +01:00
parent 30b6a41b8d
commit 9736c0e78e
4 changed files with 162 additions and 55 deletions

View File

@@ -2,12 +2,14 @@ use std::cell::RefCell;
use std::collections::HashMap;
use std::mem::size_of;
use std::ops::DerefMut as _;
use std::sync::RwLock;
use bumpalo::collections::vec::Vec as BumpVec;
use bumpalo::Bump;
use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::attribute_patterns::match_field_legacy;
use crate::fields_ids_map::metadata::Metadata;
use crate::update::new::document::{Document, DocumentContext};
use crate::update::new::extract::cache::BalancedCaches;
@@ -23,7 +25,10 @@ use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::{DocumentChange, DocumentIdentifiers};
use crate::update::settings::SettingsDelta;
use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{
bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, PatternMatch, Result, UserError,
MAX_POSITION_PER_ATTRIBUTE,
};
const MAX_COUNTED_WORDS: usize = 30;
@@ -330,6 +335,24 @@ impl WordDocidsExtractors {
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|| disabled_typos_terms.is_exact(word)
};
let mut should_tokenize = |field_name: &str| {
let Some((field_id, meta)) = new_fields_ids_map.id_with_metadata_or_insert(field_name)
else {
return Err(UserError::AttributeLimitReached.into());
};
let pattern_match = if meta.is_searchable() {
PatternMatch::Match
} else {
// TODO: this should be a match on the field_name using the `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
PatternMatch::Parent
};
Ok((field_id, pattern_match))
};
match document_change {
DocumentChange::Deletion(inner) => {
let mut token_fn = |fname: &str, fid, pos, word: &str| {
@@ -344,7 +367,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -372,7 +395,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
@@ -388,7 +411,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.merged(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -405,7 +428,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.inserted(),
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -528,50 +551,113 @@ impl SettingsChangeWordDocidsExtractors {
document_tokenizer: &DocumentTokenizer,
settings_delta: &SD,
) -> Result<()> {
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
let doc_alloc = &context.doc_alloc;
// TODO extract words based on the settings delta here
//
// Note: In insert_del_u32 we should touch the word_fid_docids and
// the fid_word_count_docids if the current field has been added
// or deleted from the list (we can add a boolean to help).
dbg!(document.external_document_id());
// TODO do this outside the loop
let new_fields_ids_map = settings_delta.new_fields_ids_map();
let old_fields_ids_map = context.index.fields_ids_map_with_metadata(&context.rtxn)?;
let old_searchable = settings_delta.old_searchable_attributes().as_ref();
let new_searchable = settings_delta.new_searchable_attributes().as_ref();
let current_document = document.current(
&context.rtxn,
context.index,
old_fields_ids_map.as_fields_ids_map(),
)?;
for result in current_document.iter_top_level_fields() {
let (field_name, field_value) = result?;
let field_id = old_fields_ids_map.id(field_name).unwrap();
let mut should_tokenize = |field_name: &str| {
let field_id = new_fields_ids_map.id(field_name).expect("all field IDs must exist");
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
match (old_field_metadata, new_field_metadata) {
(Metadata { searchable: Some(_), .. }, Metadata { searchable: None, .. }) => {
eprintln!(
"The document with id `{}` has the field `{}` that must be deleted (be careful)",
document.external_document_id(), field_name,
);
}
(Metadata { searchable: None, .. }, Metadata { searchable: Some(_), .. }) => {
eprintln!(
"The document with id `{}` has the field `{}` that must be tokenized",
document.external_document_id(),
field_name,
);
}
_ => todo!(),
}
let pattern_match =
if old_field_metadata.is_searchable() || new_field_metadata.is_searchable() {
PatternMatch::Match
// If either the old or new searchable attributes are unset (everything is searchable),
// or any old/new attribute pattern matches this field as a parent, we still need to
// descend into it to fill the fields-ids map.
} else if old_searchable.zip(new_searchable).map_or(true, |(old, new)| {
old.iter()
.chain(new)
.any(|attr| match_field_legacy(attr, field_name) == PatternMatch::Parent)
}) {
PatternMatch::Parent
} else {
PatternMatch::NoMatch
};
// TODO extract words from the document here
}
Ok((field_id, pattern_match))
};
let mut token_fn = |_field_name: &str, field_id, pos, word: &str| {
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
match (old_field_metadata, new_field_metadata) {
(
Metadata { searchable: Some(_), exact: old_exact, .. },
Metadata { searchable: None, .. },
) => {
cached_sorter.insert_del_u32(
field_id,
pos,
word,
// TODO don't forget to check `disabled_typos_terms` and add it to the SettingsDelta
old_exact,
document.docid(),
doc_alloc,
)
}
(
Metadata { searchable: None, .. },
Metadata { searchable: Some(_), exact: new_exact, .. },
) => {
cached_sorter.insert_add_u32(
field_id,
pos,
word,
// TODO same
new_exact,
document.docid(),
doc_alloc,
)
}
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. }) => {
cached_sorter.insert_del_u32(
field_id,
pos,
word,
// TODO same
old_exact,
document.docid(),
doc_alloc,
)?;
cached_sorter.insert_add_u32(
field_id,
pos,
word,
// TODO same
new_exact,
document.docid(),
doc_alloc,
)
}
}
};
document_tokenizer.tokenize_document(
current_document,
&mut should_tokenize,
&mut token_fn,
)?;
Ok(())
}
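
The core of the settings-change extraction above is the per-field comparison of old and new searchability: a field that stops being searchable only produces deletions, a newly searchable field only produces additions, and a field that stays searchable is deleted and re-added because other metadata such as the `exact` flag may have changed. A minimal, self-contained sketch of that decision table follows; the types are hypothetical stand-ins, not the real `Metadata` or `BalancedCaches`.

// Hypothetical, simplified model of the del/add decision used by the
// settings-change word extractor; it mirrors the match on old/new
// searchability without the real Meilisearch types.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Searchable {
    Yes,
    No,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum WordAction {
    Delete,       // field was searchable and no longer is: remove its words
    Add,          // field becomes searchable: index its words
    DeleteAndAdd, // field stays searchable: re-extract (e.g. the `exact` flag may differ)
}

fn word_action(old: Searchable, new: Searchable) -> Option<WordAction> {
    match (old, new) {
        (Searchable::Yes, Searchable::No) => Some(WordAction::Delete),
        (Searchable::No, Searchable::Yes) => Some(WordAction::Add),
        (Searchable::Yes, Searchable::Yes) => Some(WordAction::DeleteAndAdd),
        // Fields that are searchable in neither version are skipped by `should_tokenize`.
        (Searchable::No, Searchable::No) => None,
    }
}

fn main() {
    assert_eq!(word_action(Searchable::Yes, Searchable::No), Some(WordAction::Delete));
    assert_eq!(word_action(Searchable::No, Searchable::No), None);
}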

View File

@@ -16,7 +16,9 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{
FieldId, GlobalFieldsIdsMap, PatternMatch, Result, UserError, MAX_POSITION_PER_ATTRIBUTE,
};
pub struct WordPairProximityDocidsExtractorData<'a> {
tokenizer: DocumentTokenizer<'a>,
@@ -279,7 +281,24 @@ fn process_document_tokens<'doc>(
word_positions.push_back((Rc::from(word), pos));
Ok(())
};
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
let mut should_tokenize = |field_name: &str| {
let Some((field_id, meta)) = fields_ids_map.id_with_metadata_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
let pattern_match = if meta.is_searchable() {
PatternMatch::Match
} else {
// TODO: this should be a match on the field_name using the `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
PatternMatch::Parent
};
Ok((field_id, pattern_match))
};
document_tokenizer.tokenize_document(document, &mut should_tokenize, &mut token_fn)?;
drain_word_positions(word_positions, word_pair_proximity);
Ok(())
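
The `PatternMatch` value returned by the new `should_tokenize` callback is what lets the tokenizer keep filling the fields-ids map without tokenizing everything: `Match` tokenizes the field, `Parent` only descends into nested values so their field IDs still get registered, and `NoMatch` prunes the subtree. The sketch below is a simplified, hypothetical model of that contract, not the real `perm_json_p` walker.

// Illustrative only: a toy document walker honoring the Match / Parent / NoMatch contract.
use serde_json::{json, Value};

#[derive(Clone, Copy, PartialEq)]
enum PatternMatch {
    Match,   // tokenize this field
    Parent,  // don't tokenize, but keep walking nested values
    NoMatch, // prune the whole subtree
}

fn walk(
    prefix: &str,
    value: &Value,
    should_tokenize: &mut impl FnMut(&str) -> PatternMatch,
    token_fn: &mut impl FnMut(&str, &str),
) {
    match should_tokenize(prefix) {
        PatternMatch::NoMatch => (),
        pattern => {
            if pattern == PatternMatch::Match {
                if let Value::String(s) = value {
                    token_fn(prefix, s);
                }
            }
            // Match and Parent both descend, so nested field names are still visited.
            if let Value::Object(map) = value {
                for (key, nested) in map {
                    let name = format!("{prefix}.{key}");
                    walk(&name, nested, should_tokenize, token_fn);
                }
            }
        }
    }
}

fn main() {
    let doc = json!({ "title": "Hello", "meta": { "tag": "world" } });
    let mut seen = Vec::new();
    let mut should_tokenize = |name: &str| {
        if name == "title" || name.ends_with("tag") { PatternMatch::Match } else { PatternMatch::Parent }
    };
    for (key, value) in doc.as_object().unwrap() {
        walk(key, value, &mut should_tokenize, &mut |f, w| seen.push((f.to_owned(), w.to_owned())));
    }
    seen.sort();
    assert_eq!(seen, vec![("meta.tag".into(), "world".into()), ("title".into(), "Hello".into())]);
}

Under this contract, returning `Parent` for non-searchable fields (the legacy behaviour mentioned in the TODOs above) costs a full walk of the document but guarantees every nested field name receives an id.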

View File

@@ -8,10 +8,7 @@ use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{
seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
};
use crate::{
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
MAX_WORD_LENGTH,
};
use crate::{FieldId, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH};
// todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten
const MAX_DISTANCE: u32 = 8;
@@ -26,22 +23,16 @@ impl DocumentTokenizer<'_> {
pub fn tokenize_document<'doc>(
&self,
document: impl Document<'doc>,
field_id_map: &mut GlobalFieldsIdsMap,
should_tokenize: &mut impl FnMut(&str) -> Result<(FieldId, PatternMatch)>,
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
) -> Result<()> {
let mut field_position = HashMap::new();
let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
if meta.is_searchable() {
let (field_id, pattern_match) = should_tokenize(field_name)?;
if pattern_match == PatternMatch::Match {
self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
}
// todo: should be a match on the field_name using `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
Ok(PatternMatch::Match)
Ok(pattern_match)
};
for entry in document.iter_top_level_fields() {
@@ -192,7 +183,7 @@ mod test {
use super::*;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::update::new::document::{DocumentFromVersions, Versions};
use crate::FieldsIdsMap;
use crate::{FieldsIdsMap, GlobalFieldsIdsMap};
#[test]
fn test_tokenize_document() {

View File

@@ -1633,10 +1633,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let embedding_config_updates = self.update_embedding_configs()?;
self.update_user_defined_searchable_attributes()?;
let mut new_inner_settings =
InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
// TODO maybe not needed?
new_inner_settings.recompute_searchables(self.wtxn, self.index)?;
// Note that we don't need to update the searchables here,
// as it will be done after the settings update.
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
let primary_key_id = self
.index
@@ -2579,6 +2578,11 @@ fn deserialize_sub_embedder(
/// Implement this trait for the settings delta type.
/// This is used in the new settings update flow and will make it easy to replace the old settings delta type: `InnerIndexSettingsDiff`.
pub trait SettingsDelta {
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
fn new_embedders(&self) -> &RuntimeEmbedders;
fn old_embedders(&self) -> &RuntimeEmbedders;
fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
@@ -2590,7 +2594,6 @@ pub trait SettingsDelta {
) -> std::result::Result<(), E>
where
F: FnMut(FragmentDiff) -> std::result::Result<(), E>;
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
}
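
The doc comment above frames `SettingsDelta` as the interface that will eventually replace `InnerIndexSettingsDiff`, with extractors only relying on paired old/new accessors. As a hedged sketch (a stripped-down stand-in trait, not the real one), this is how code generic over such a delta could diff the user-defined searchable attributes exposed by `old_searchable_attributes`/`new_searchable_attributes`.

// Hypothetical stand-in for the old/new searchable accessors of `SettingsDelta`.
use std::collections::BTreeSet;

trait SearchableDelta {
    fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
    fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
}

/// Attributes removed from / added to the user-defined searchable list.
/// `None` means "all fields are searchable", so a change to or from `None`
/// cannot be reduced to a name diff and is reported as `None` here.
fn searchable_diff<D: SearchableDelta>(delta: &D) -> Option<(BTreeSet<String>, BTreeSet<String>)> {
    let old: BTreeSet<_> = delta.old_searchable_attributes().as_ref()?.iter().cloned().collect();
    let new: BTreeSet<_> = delta.new_searchable_attributes().as_ref()?.iter().cloned().collect();
    let removed = old.difference(&new).cloned().collect();
    let added = new.difference(&old).cloned().collect();
    Some((removed, added))
}

struct Delta {
    old: Option<Vec<String>>,
    new: Option<Vec<String>>,
}

impl SearchableDelta for Delta {
    fn old_searchable_attributes(&self) -> &Option<Vec<String>> { &self.old }
    fn new_searchable_attributes(&self) -> &Option<Vec<String>> { &self.new }
}

fn main() {
    let delta = Delta {
        old: Some(vec!["title".into(), "overview".into()]),
        new: Some(vec!["title".into(), "genres".into()]),
    };
    let (removed, added) = searchable_diff(&delta).unwrap();
    assert!(removed.contains("overview") && added.contains("genres"));
}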
pub struct FragmentDiff<'a> {
@@ -2599,6 +2602,18 @@ pub struct FragmentDiff<'a> {
}
impl SettingsDelta for InnerIndexSettingsDiff {
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn new_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.new.user_defined_searchable_attributes
}
fn old_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.old.user_defined_searchable_attributes
}
fn new_embedders(&self) -> &RuntimeEmbedders {
&self.new.runtime_embedders
}
@@ -2615,10 +2630,6 @@ impl SettingsDelta for InnerIndexSettingsDiff {
&self.embedding_config_updates
}
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn try_for_each_fragment_diff<F, E>(
&self,
embedder_name: &str,