mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-14 00:36:25 +00:00
WIP
- manual embedder - multi embedders OK - clippy + tests OK
This commit is contained in:
@ -73,6 +73,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
field_id_map: &FieldsIdsMap,
|
||||
prompt: &Prompt,
|
||||
embedder_name: &str,
|
||||
) -> Result<ExtractedVectorPoints> {
|
||||
puffin::profile_function!();
|
||||
|
||||
@ -115,89 +116,87 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let delta = if let Some(value) = vectors_fid.and_then(|vectors_fid| obkv.get(vectors_fid)) {
|
||||
let vectors_obkv = KvReaderDelAdd::new(value);
|
||||
match (vectors_obkv.get(DelAdd::Deletion), vectors_obkv.get(DelAdd::Addition)) {
|
||||
(Some(old), Some(new)) => {
|
||||
// no autogeneration
|
||||
let del_vectors = extract_vectors(old, document_id)?;
|
||||
let add_vectors = extract_vectors(new, document_id)?;
|
||||
let vectors_field = vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_maps(obkv, document_id))
|
||||
.transpose()?;
|
||||
|
||||
VectorStateDelta::ManualDelta(
|
||||
del_vectors.unwrap_or_default(),
|
||||
add_vectors.unwrap_or_default(),
|
||||
)
|
||||
}
|
||||
(None, Some(new)) => {
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = extract_vectors(new, document_id)?;
|
||||
let (del_map, add_map) = vectors_field.unzip();
|
||||
let del_map = del_map.flatten();
|
||||
let add_map = add_map.flatten();
|
||||
|
||||
VectorStateDelta::WasGeneratedNowManual(add_vectors.unwrap_or_default())
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
field_id_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
(None, None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
||||
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt =
|
||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
log::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
log::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
let delta = match (del_value, add_value) {
|
||||
(Some(old), Some(new)) => {
|
||||
// no autogeneration
|
||||
let del_vectors = extract_vectors(old, document_id, embedder_name)?;
|
||||
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
|
||||
|
||||
if add_vectors.len() > u8::MAX.into() {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
|
||||
}
|
||||
(Some(_old), None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
field_id_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt =
|
||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
log::trace!("🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}");
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
log::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
(None, Some(new)) => {
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
|
||||
if add_vectors.len() > u8::MAX.into() {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::WasGeneratedNowManual(add_vectors)
|
||||
}
|
||||
(None, None) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt =
|
||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
log::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
log::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
};
|
||||
|
||||
@ -221,6 +220,34 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
})
|
||||
}
|
||||
|
||||
fn to_vector_maps(
|
||||
obkv: KvReaderDelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
|
||||
let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
|
||||
let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
|
||||
Ok((del, add))
|
||||
}
|
||||
|
||||
fn to_vector_map(
|
||||
obkv: KvReaderDelAdd,
|
||||
side: DelAdd,
|
||||
document_id: &impl Fn() -> Value,
|
||||
) -> Result<Option<serde_json::Map<String, Value>>> {
|
||||
Ok(if let Some(value) = obkv.get(side) {
|
||||
let Ok(value) = from_slice(value) else {
|
||||
let value = from_slice(value).map_err(InternalError::SerdeJson)?;
|
||||
return Err(crate::Error::UserError(UserError::InvalidVectorsMapType {
|
||||
document_id: document_id(),
|
||||
value,
|
||||
}));
|
||||
};
|
||||
Some(value)
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn push_vectors_diff(
|
||||
@ -286,12 +313,20 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||
}
|
||||
|
||||
/// Extracts the vectors from a JSON value.
|
||||
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
|
||||
match from_slice(value) {
|
||||
Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
|
||||
fn extract_vectors(
|
||||
value: Value,
|
||||
document_id: impl Fn() -> Value,
|
||||
name: &str,
|
||||
) -> Result<Vec<Vec<f32>>> {
|
||||
// FIXME: ugly clone of the vectors here
|
||||
match serde_json::from_value(value.clone()) {
|
||||
Ok(vectors) => {
|
||||
Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default())
|
||||
}
|
||||
Err(_) => Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value: from_slice(value).map_err(InternalError::SerdeJson)?,
|
||||
value,
|
||||
subfield: name.to_owned(),
|
||||
}
|
||||
.into()),
|
||||
}
|
||||
|
@ -298,6 +298,7 @@ fn send_original_documents_data(
|
||||
indexer,
|
||||
&field_id_map,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
|
Reference in New Issue
Block a user