mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-27 16:51:01 +00:00
always push the user defined vectors in arroy
This commit is contained in:
@ -1,244 +0,0 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_fid.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 14,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 12,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 83,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 73,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 78,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
@ -1,7 +0,0 @@
|
||||
---
|
||||
source: milli/src/index.rs
|
||||
---
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
|
@ -1,7 +0,0 @@
|
||||
---
|
||||
source: milli/src/index.rs
|
||||
---
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
|
@ -8,7 +8,6 @@ use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
@ -50,7 +49,7 @@ enum VectorStateDelta {
|
||||
// Note: changing the value of the manually specified vector **should not record** this delta
|
||||
WasGeneratedNowManual(Vec<Vec<f32>>),
|
||||
|
||||
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),
|
||||
ManualDelta(Vec<Vec<f32>>),
|
||||
|
||||
// Add the vector computed from the specified prompt
|
||||
// Remove any previous vector
|
||||
@ -59,14 +58,12 @@ enum VectorStateDelta {
|
||||
}
|
||||
|
||||
impl VectorStateDelta {
|
||||
fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) {
|
||||
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
|
||||
match self {
|
||||
VectorStateDelta::NoChange => Default::default(),
|
||||
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
||||
VectorStateDelta::WasGeneratedNowManual(add) => {
|
||||
(true, Default::default(), (Default::default(), add))
|
||||
}
|
||||
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
|
||||
VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add),
|
||||
VectorStateDelta::ManualDelta(add) => (false, Default::default(), add),
|
||||
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
||||
}
|
||||
}
|
||||
@ -166,8 +163,14 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
||||
docid,
|
||||
embedders_configs,
|
||||
obkv,
|
||||
old_vectors_fid,
|
||||
new_vectors_fid,
|
||||
)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
@ -182,7 +185,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
{
|
||||
let delta = match parsed_vectors.remove(embedder_name) {
|
||||
(Some(old), Some(new)) => {
|
||||
match (old.is_user_provided(), new.is_user_provided()) {
|
||||
match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) {
|
||||
(true, true) | (false, false) => (),
|
||||
(true, false) => {
|
||||
remove_from_user_defined.insert(docid);
|
||||
@ -193,7 +196,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
// no autogeneration
|
||||
let del_vectors = old.into_array_of_vectors();
|
||||
let add_vectors = new.into_array_of_vectors();
|
||||
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
@ -203,15 +205,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
|
||||
VectorStateDelta::ManualDelta(add_vectors)
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
(Some(old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if document_is_kept && old.is_some() {
|
||||
remove_from_user_defined.insert(docid);
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
@ -219,6 +221,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else if document_is_kept && old.is_none() {
|
||||
VectorStateDelta::NoChange
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
@ -315,8 +319,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
/// We cannot compute the diff between both Del and Add vectors.
|
||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
||||
fn push_vectors_diff(
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
@ -325,7 +329,7 @@ fn push_vectors_diff(
|
||||
delta: VectorStateDelta,
|
||||
reindex_vectors: bool,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
||||
if must_remove
|
||||
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
|
||||
// When vector pipeline will be optimized, this should be removed.
|
||||
@ -340,44 +344,25 @@ fn push_vectors_diff(
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
let merged_vectors_iter =
|
||||
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||
// let merged_vectors_iter =
|
||||
// itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(vector) => {
|
||||
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
|
||||
// When vector pipeline will be optimized, this should be removed.
|
||||
if !reindex_vectors {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Right(vector) => {
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
@ -4,8 +4,9 @@ use obkv::KvReader;
|
||||
use serde_json::{from_slice, Value};
|
||||
|
||||
use super::Embedding;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::{FieldId, InternalError, UserError};
|
||||
use crate::{DocumentId, FieldId, InternalError, UserError};
|
||||
|
||||
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
|
||||
|
||||
@ -42,17 +43,19 @@ pub struct ExplicitVectors {
|
||||
}
|
||||
|
||||
pub struct ParsedVectorsDiff {
|
||||
pub old: Option<BTreeMap<String, Vectors>>,
|
||||
pub old: BTreeMap<String, Option<Vectors>>,
|
||||
pub new: Option<BTreeMap<String, Vectors>>,
|
||||
}
|
||||
|
||||
impl ParsedVectorsDiff {
|
||||
pub fn new(
|
||||
docid: DocumentId,
|
||||
embedders_configs: &[IndexEmbeddingConfig],
|
||||
documents_diff: KvReader<'_, FieldId>,
|
||||
old_vectors_fid: Option<FieldId>,
|
||||
new_vectors_fid: Option<FieldId>,
|
||||
) -> Result<Self, Error> {
|
||||
let old = match old_vectors_fid
|
||||
let mut old = match old_vectors_fid
|
||||
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
|
||||
@ -68,7 +71,13 @@ impl ParsedVectorsDiff {
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
.flatten();
|
||||
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect());
|
||||
for embedding_config in embedders_configs {
|
||||
if embedding_config.user_defined.contains(docid) {
|
||||
old.entry(embedding_config.name.to_string()).or_insert(None);
|
||||
}
|
||||
}
|
||||
|
||||
let new = new_vectors_fid
|
||||
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
@ -78,8 +87,9 @@ impl ParsedVectorsDiff {
|
||||
Ok(Self { old, new })
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) {
|
||||
let old = self.old.as_mut().and_then(|old| old.remove(embedder_name));
|
||||
/// Return (Some(None), _) in case the vector is user defined and contained in the database.
|
||||
pub fn remove(&mut self, embedder_name: &str) -> (Option<Option<Vectors>>, Option<Vectors>) {
|
||||
let old = self.old.remove(embedder_name);
|
||||
let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
|
||||
(old, new)
|
||||
}
|
||||
|
Reference in New Issue
Block a user