4597: Fix embeddings settings update r=ManyTheFish a=ManyTheFish

# Pull Request
- add some conditions reducing the work done when changing the settings
- add some benchmarks on embedders

## Related issue
Fixes #4585


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2024-04-25 16:37:28 +00:00
committed by GitHub
3 changed files with 166 additions and 11 deletions

View File

@@ -198,11 +198,16 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
if document_is_kept { if document_is_kept {
// Don't give up if the old prompt was failing // Don't give up if the old prompt was failing
let old_prompt = prompt let old_prompt = Some(prompt)
.render(obkv, DelAdd::Deletion, old_fields_ids_map) // TODO: this filter works because we erase the vec database when a embedding setting changes.
.unwrap_or_default(); // When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt != new_prompt { if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!( tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
); );
@@ -224,6 +229,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
&mut manual_vectors_writer, &mut manual_vectors_writer,
&mut key_buffer, &mut key_buffer,
delta, delta,
settings_diff,
)?; )?;
} }
@@ -264,10 +270,15 @@ fn push_vectors_diff(
manual_vectors_writer: &mut Writer<BufWriter<File>>, manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>, key_buffer: &mut Vec<u8>,
delta: VectorStateDelta, delta: VectorStateDelta,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<()> { ) -> Result<()> {
puffin::profile_function!(); puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove { if must_remove
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !settings_diff.reindex_vectors()
{
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?; remove_vectors_writer.insert(&key_buffer, [])?;
} }
@@ -295,12 +306,16 @@ fn push_vectors_diff(
match eob { match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => { EitherOrBoth::Left(vector) => {
// We insert only the Del part of the Obkv to inform // TODO: the below condition works because we erase the vec database when a embedding setting changes.
// that we only want to remove all those vectors. // When vector pipeline will be optimized, this should be removed.
let mut obkv = KvWriterDelAdd::memory(); if !settings_diff.reindex_vectors() {
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; // We insert only the Del part of the Obkv to inform
let bytes = obkv.into_inner()?; // that we only want to remove all those vectors.
manual_vectors_writer.insert(&key_buffer, bytes)?; let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
} }
EitherOrBoth::Right(vector) => { EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform // We insert only the Add part of the Obkv to inform

View File

@@ -0,0 +1,68 @@
{
"name": "movies-subset-hf-embeddings",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace"
}
}
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
}
]
}

View File

@@ -0,0 +1,72 @@
{
"name": "settings-add-embeddings-hf",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace",
"model": null,
"revision": null,
"documentTemplate": null,
"distribution": null
}
}
}
},
"synchronous": "WaitForTask"
}
]
}