From 396d76046de8adff191410c01cf4dbfb6bd86efd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 12 Jun 2025 15:41:53 +0200 Subject: [PATCH 1/4] Regenerate embeddings more often: - When `regenerate` was previously `false` and became `true` - When rendering the old version of the docs failed --- .../src/update/new/extract/vectors/mod.rs | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 47bd622ae..0c727fa11 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -141,17 +141,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { context.new_fields_ids_map, &context.doc_alloc, )?; - let old_rendered = prompt.render_document( - update.external_document_id(), - update.merged( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - if new_rendered != old_rendered { + let must_regenerate = if !old_vectors.regenerate { + // we just enabled `regenerate` + true + } else { + let old_rendered = prompt.render_document( + update.external_document_id(), + update.merged( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + ); + + if let Ok(old_rendered) = old_rendered { + // must regenerate if the rendered changed + new_rendered != old_rendered + } else { + // cannot check previous rendered, better regenerate + true + } + }; + + if must_regenerate { chunks.set_autogenerated( update.docid(), update.external_document_id(), From 209c4bfc18112e5921008ad91073138f04dc7bb1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 12 Jun 2025 15:47:47 +0200 Subject: [PATCH 2/4] Switch the versions of the documents for rendering :/ --- crates/milli/src/update/new/extract/vectors/mod.rs | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 0c727fa11..77bfc1206 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -133,7 +133,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(), - update.current( + update.merged( &context.rtxn, context.index, context.db_fields_ids_map, @@ -147,7 +147,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } else { let old_rendered = prompt.render_document( update.external_document_id(), - update.merged( + update.current( &context.rtxn, context.index, context.db_fields_ids_map, From 68e7bfb37fd6334803664136ab31c98a299a8e77 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 12 Jun 2025 15:55:33 +0200 Subject: [PATCH 3/4] Don't fail if you cannot render previous version --- .../src/update/new/extract/vectors/mod.rs | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 77bfc1206..2864475c9 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -175,16 +175,6 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } } } else if old_vectors.regenerate { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; let new_rendered = prompt.render_document( update.external_document_id(), update.merged( @@ -195,7 +185,28 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { 
context.new_fields_ids_map, &context.doc_alloc, )?; - if new_rendered != old_rendered { + + let must_regenerate = { + let old_rendered = prompt.render_document( + update.external_document_id(), + update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + ); + if let Ok(old_rendered) = old_rendered { + // regenerate if the rendered version changed + new_rendered != old_rendered + } else { + // if we cannot render the previous version of the documents, let's regenerate + true + } + }; + + if must_regenerate { chunks.set_autogenerated( update.docid(), update.external_document_id(), From 72004372465895e09cb2e5599d412fb19687a016 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 12 Jun 2025 15:55:52 +0200 Subject: [PATCH 4/4] Comment the cases --- crates/milli/src/update/new/extract/vectors/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 2864475c9..43647e786 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -111,6 +111,8 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { let prompt = chunks.prompt(); let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + + // case where we have a `_vectors` field in the updated document if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { @@ -130,6 +132,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { error: error.to_string(), })?, )?; + // regenerate if the new `_vectors` fields is set to. 
} else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(), @@ -174,6 +177,7 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { )?; } } + // no `_vectors` field, so only regenerate if the document is already set to regenerate in the DB. } else if old_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(),