Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-11-23 13:16:33 +00:00

Compare commits: 1 commit on branch prototype-...reduce-pre

| Author | SHA1 | Date |
|---|---|---|
|  | 934b73142d |  |

Cargo.lock (generated): 3

@@ -394,7 +394,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arroy"
version = "0.6.1"
source = "git+https://github.com/meilisearch/arroy.git?rev=5b748bac2c69c65a97980901b02067a3a545e357#5b748bac2c69c65a97980901b02067a3a545e357"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
dependencies = [
"bytemuck",
"byteorder",

@@ -625,8 +625,8 @@ impl IndexScheduler {
task_id: Option<TaskId>,
dry_run: bool,
) -> Result<Task> {
// if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty())
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
&& (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40
{
return Err(Error::NoSpaceLeftInTaskQueue);

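For context on the guard in the hunk above: the scheduler refuses a new task once the tasks LMDB environment is more than 40% full, based on `non_free_pages_size()` and the environment `map_size`, unless the task itself frees space. A minimal standalone sketch of that arithmetic (hypothetical helper and made-up sizes, not the scheduler's real API):

```rust
/// Hedged sketch of the fullness guard above; `non_free_pages_size` and
/// `map_size` stand in for the values heed reports for the task database.
fn must_refuse_enqueue(non_free_pages_size: u64, map_size: u64, frees_space: bool) -> bool {
    // Integer arithmetic: used * 100 / total is the fill percentage, rounded down.
    let fill_percent = non_free_pages_size * 100 / map_size;
    // Tasks that free space (a deletion/cancelation with a non-empty selection) are always accepted.
    !frees_space && fill_percent > 40
}

fn main() {
    // 450 MiB used out of a 1 GiB map is ~43% full, so a regular task is refused...
    assert!(must_refuse_enqueue(450 << 20, 1 << 30, false));
    // ...but a task that deletes existing tasks still goes through.
    assert!(!must_refuse_enqueue(450 << 20, 1 << 30, true));
}
```
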
@@ -292,6 +292,8 @@ impl Queue {
return Ok(task);
}

// Get rid of the mutability.
let task = task;
self.tasks.register(wtxn, &task)?;

Ok(task)

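The `// Get rid of the mutability.` lines in the hunk above rely on rebinding: shadowing a `mut` binding with an immutable one of the same name freezes it for the rest of the function. A tiny illustration of the pattern (hypothetical `Task` struct, not the queue's real type):

```rust
#[derive(Debug)]
struct Task {
    uid: u32,
    enqueued: bool,
}

fn main() {
    let mut task = Task { uid: 0, enqueued: false };
    task.enqueued = true;

    // Get rid of the mutability: rebinding under the same name makes the value
    // read-only for the rest of the scope.
    let task = task;
    // task.enqueued = false; // would no longer compile
    println!("{task:?}");
}
```
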
@@ -364,7 +364,7 @@ fn test_task_queue_is_full() {
// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

// Even the task deletion and cancelation that don't delete anything should be refused
// Even the task deletion that doesn't delete anything shouldn't be accepted
let result = index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() },

@@ -373,39 +373,10 @@ fn test_task_queue_is_full() {
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
let result = index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");

// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

// But a task cancelation that cancel something should work
index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() },
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();

// But we should still be forbidden from enqueuing new tasks
let result = index_scheduler
.register(
KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");

// And a task deletion that delete something should works
// But a task deletion that delete something should works
index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() },

@@ -454,10 +454,7 @@ impl ErrorCode for milli::Error {
}
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. }
| UserError::InvalidIndexingVectorDimensions { .. } => {
Code::InvalidVectorDimensions
}
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,

@@ -164,87 +164,6 @@ async fn add_remove_user_provided() {
"###);
}

#[actix_rt::test]
async fn user_provide_mismatched_embedding_dimension() {
let server = Server::new().await;
let index = server.index("doggo");

let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await.succeeded();

let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);

let new_document = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
]);
let (response, code) = index.add_documents(new_document, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}

async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");

@@ -87,8 +87,7 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838
"no_time",
"sync",
] }
# arroy = "0.6.1"
arroy = { git = "https://github.com/meilisearch/arroy.git", rev = "5b748bac2c69c65a97980901b02067a3a545e357" } # incremental update
arroy = "0.6.1"
rand = "0.8.5"
tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] }

@@ -129,14 +129,6 @@ and can not be more than 511 bytes.", .document_id.to_string()
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
InvalidIndexingVectorDimensions {
embedder_name: String,
document_id: String,
embedding_index: usize,
expected: usize,
found: usize,
},
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]

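The `InvalidIndexingVectorDimensions` variant shown above is a `thiserror`-style variant whose named fields are spliced into the message, the same wording that appears in the snapshots of the removed test earlier. A reduced sketch of how such a variant renders, assuming the `thiserror` crate:

```rust
use thiserror::Error;

/// Hedged sketch of the variant above, trimmed to the fields used in the message.
#[derive(Debug, Error)]
enum SketchError {
    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
    InvalidIndexingVectorDimensions {
        embedder_name: String,
        document_id: String,
        embedding_index: usize,
        expected: usize,
        found: usize,
    },
}

fn main() {
    let err = SketchError::InvalidIndexingVectorDimensions {
        embedder_name: "manual".into(),
        document_id: "0".into(),
        embedding_index: 0,
        expected: 3,
        found: 2,
    };
    // Prints the same wording as the snapshots in the test above.
    println!("{err}");
}
```
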
@@ -266,15 +266,12 @@ impl Step for arroy::MainStep {
"writing the descendants and metadata"
}
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
arroy::MainStep::UpdatingTheTrees => "updating the trees",
arroy::MainStep::CreateNewTrees => "create new trees",
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
arroy::MainStep::WriteTheMetadata => "write the metadata",
arroy::MainStep::RetrievingTheItemsIds => "retrieving the items ids",
arroy::MainStep::RetrievingTheUsedTreeNodes => "retrieving the used tree nodes",
arroy::MainStep::DeletingExtraTrees => "deleting extra trees",
arroy::MainStep::RemoveItemsFromExistingTrees => "remove items from existing trees",
arroy::MainStep::InsertItemsInCurrentTrees => "insert items in current trees",
arroy::MainStep::IncrementalIndexLargeDescendants => {
"incremental index large descendants"
}
}
.into()
}

@@ -173,19 +173,17 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ranking_rule_scores.push(ScoreDetails::Skipped);

// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold =
ranking_score_threshold.is_some_and(|ranking_score_threshold| {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});

if is_below_threshold {
all_candidates -= &bucket;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(bucket);
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}

maybe_add_to_results!(bucket);

ranking_rule_scores.pop();

if cur_ranking_rule_index == 0 {

@@ -239,24 +237,23 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
);

// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}

ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;

if cur_ranking_rule_index == ranking_rules_len - 1
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|| is_below_threshold
{
if is_below_threshold {
all_candidates -= &next_bucket.candidates;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(next_bucket.candidates);
}
maybe_add_to_results!(next_bucket.candidates);
ranking_rule_scores.pop();
continue;
}

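Both `bucket_sort` hunks deal with the same step: once a bucket's ranking-rule scores are known, compute a global score and, when it falls below the ranking score threshold, remove the bucket's candidates from the universe instead of adding them to the results. A simplified, self-contained sketch of that filtering decision (plain floats and `BTreeSet` standing in for `ScoreDetails` and `RoaringBitmap`):

```rust
use std::collections::BTreeSet;

/// Hedged sketch: `scores` stands in for the per-rule score details, the global
/// score here is just their product, and BTreeSet replaces the bitmap types.
fn filter_bucket(
    universe: &mut BTreeSet<u32>,
    results: &mut Vec<u32>,
    bucket: BTreeSet<u32>,
    scores: &[f64],
    ranking_score_threshold: Option<f64>,
) {
    let global_score: f64 = scores.iter().product();
    let is_below_threshold =
        ranking_score_threshold.is_some_and(|threshold| global_score < threshold);

    if is_below_threshold {
        // Remove the candidates from the universe without adding them to the results.
        for docid in &bucket {
            universe.remove(docid);
        }
    } else {
        results.extend(bucket);
    }
}

fn main() {
    let mut universe: BTreeSet<u32> = (0..10).collect();
    let mut results = Vec::new();
    filter_bucket(&mut universe, &mut results, (0..3).collect(), &[0.9, 0.4], Some(0.5));
    // 0.9 * 0.4 = 0.36 < 0.5, so docids 0..3 are dropped from the universe.
    assert!(results.is_empty());
    assert_eq!(universe.len(), 7);
}
```
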
@@ -37,12 +37,12 @@ pub struct DatabaseCache<'ctx> {

pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,

pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
}

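The struct change above turns the cached values for the two prefix maps from raw LMDB bytes (`Cow<'ctx, [u8]>`) into decoded `RoaringBitmap`s, since the hunks that follow sometimes have to build a prefix bitmap by merging several non-prefix entries instead of returning one stored value. A minimal sketch of that compute-once cache shape using the same `Entry::Vacant` pattern (plain `HashMap` and `String` instead of the interner and `FxHashMap`; assumes the `roaring` crate):

```rust
use std::collections::hash_map::{Entry, HashMap};

use roaring::RoaringBitmap;

/// Hedged sketch: a compute-once cache keyed by (prefix, fid), mirroring the shape
/// of `word_prefix_fid_docids` after the change above.
struct SketchCache {
    word_prefix_fid_docids: HashMap<(String, u16), Option<RoaringBitmap>>,
}

impl SketchCache {
    fn get_or_compute(
        &mut self,
        key: (String, u16),
        compute: impl FnOnce() -> Option<RoaringBitmap>,
    ) -> Option<RoaringBitmap> {
        if let Entry::Vacant(entry) = self.word_prefix_fid_docids.entry(key.clone()) {
            // Only pay for the (potentially expensive) merge once per key.
            entry.insert(compute());
        }
        self.word_prefix_fid_docids.get(&key).unwrap().clone()
    }
}

fn main() {
    let mut cache = SketchCache { word_prefix_fid_docids: HashMap::new() };
    let bitmap = cache.get_or_compute(("he".to_string(), 0), || Some((0..10).collect()));
    assert_eq!(bitmap.map(|b| b.len()), Some(10));
}
```
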
@@ -562,14 +562,46 @@ impl<'ctx> SearchContext<'ctx> {
return Ok(None);
}

DatabaseCache::get_value(
self.txn,
(word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids,
universe,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_fid_docids;
let prefix_db = &self.index.word_prefix_fid_docids;
let db = &self.index.word_fid_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, fid)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, fid))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();

let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;

if pos == fid {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}

entry.insert(Some(bitmap));
}
}
}

Ok(cache.get(&(word_prefix, fid)).unwrap().clone())
}

pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {

@@ -605,6 +637,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut fids = vec![];
// TODO: This is no more exhaustive, we should iterate over all fids.
let remap_key_type = self
.index
.word_prefix_fid_docids

@@ -612,11 +645,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, fid), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_fid_docids
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
let ((_, fid), _value) = result?;
fids.push(fid);
}
entry.insert(fids.clone());

@@ -648,14 +677,46 @@ impl<'ctx> SearchContext<'ctx> {
word_prefix: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids,
universe,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_position_docids;
let prefix_db = &self.index.word_prefix_position_docids;
let db = &self.index.word_position_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, position)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, position))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();

let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;

if pos == position {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}

entry.insert(Some(bitmap));
}
}
}

Ok(cache.get(&(word_prefix, position)).unwrap().clone())
}

pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {

@@ -696,6 +757,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut positions = vec![];
// TODO: This is no more exhaustive, we should iterate over all positions.
let remap_key_type = self
.index
.word_prefix_position_docids

@@ -703,11 +765,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, position), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_position_docids
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
let ((_, position), _value) = result?;
positions.push(position);
}
entry.insert(positions.clone());

@@ -121,7 +121,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
// do we have set embeddings?
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
update.external_document_id(),
update.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)

@@ -129,7 +128,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
document_id: update.external_document_id().to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),

@@ -210,7 +209,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
insertion.external_document_id(),
insertion.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)

@@ -220,7 +218,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
.to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),

@@ -275,7 +273,6 @@ struct Chunks<'a, 'b, 'extractor> {
embedder: &'a Embedder,
embedder_id: u8,
embedder_name: &'a str,
dimensions: usize,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,

@@ -300,7 +297,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
let texts = BVec::with_capacity_in(capacity, doc_alloc);
let ids = BVec::with_capacity_in(capacity, doc_alloc);
let dimensions = embedder.dimensions();
Self {
texts,
ids,

@@ -313,7 +309,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
embedder_name,
user_provided,
has_manual_generation: None,
dimensions,
}
}

@@ -495,25 +490,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
}
}

fn set_vectors(
&self,
external_docid: &'a str,
docid: DocumentId,
embeddings: Vec<Embedding>,
) -> Result<()> {
for (embedding_index, embedding) in embeddings.iter().enumerate() {
if embedding.len() != self.dimensions {
return Err(UserError::InvalidIndexingVectorDimensions {
expected: self.dimensions,
found: embedding.len(),
embedder_name: self.embedder_name.to_string(),
document_id: external_docid.to_string(),
embedding_index,
}
.into());
}
}
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
Ok(())
}
}

@@ -291,6 +291,9 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
let (_word, pos) = StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?;
positions.entry(pos).or_insert_with(Vec::new).push(bytes);
}

// We remove all the positions that have less than 100 bitmaps.
positions.retain(|_, bitmaps| bitmaps.len() > 100);
assert!(prefixes_bitmaps.insert(prefix.as_str(), positions).is_none());
}

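The lines added above prune the per-position map so that only positions backed by more than 100 serialized bitmaps are kept. A small standalone sketch of the same group-then-retain pattern (toy data and a threshold of 2 instead of 100):

```rust
use std::collections::HashMap;

fn main() {
    // Hedged sketch: group raw values under their position key, then keep only
    // the positions that accumulated more than the threshold, mirroring the
    // `retain` added above.
    let pairs: &[(u16, &str)] = &[(0, "a"), (0, "b"), (0, "c"), (1, "d"), (2, "e"), (2, "f")];

    let mut positions: HashMap<u16, Vec<&str>> = HashMap::new();
    for &(pos, bytes) in pairs {
        positions.entry(pos).or_insert_with(Vec::new).push(bytes);
    }

    // Keep only positions with strictly more than 2 entries.
    positions.retain(|_, bitmaps| bitmaps.len() > 2);

    assert_eq!(positions.len(), 1);
    assert!(positions.contains_key(&0));
}
```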