mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-24 13:37:19 +00:00
Compare commits
29 Commits
latest
...
new-search
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
32e23ba1b2 | ||
|
|
1517a3dd29 | ||
|
|
6d2198d678 | ||
|
|
28ae67501c | ||
|
|
7ec1152068 | ||
|
|
6d1a58da84 | ||
|
|
5feae8d858 | ||
|
|
bf175c3ed3 | ||
|
|
f3d7595e5f | ||
|
|
9a9061267b | ||
|
|
92bc52cd60 | ||
|
|
d5511db234 | ||
|
|
cf62af13e8 | ||
|
|
91cf94c196 | ||
|
|
753ba39199 | ||
|
|
3944c25853 | ||
|
|
925bce5fbd | ||
|
|
62065ed30d | ||
|
|
97e6ae1957 | ||
|
|
5ed9be0789 | ||
|
|
7597b1049f | ||
|
|
d99150f21b | ||
|
|
c9726674a0 | ||
|
|
205f40b3b8 | ||
|
|
3d013cdebe | ||
|
|
ddeff5678f | ||
|
|
a235434910 | ||
|
|
a376525348 | ||
|
|
361580f451 |
6
.github/workflows/publish-release-assets.yml
vendored
6
.github/workflows/publish-release-assets.yml
vendored
@@ -65,9 +65,9 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [macos-13, windows-2022]
|
||||
os: [macos-14, windows-2022]
|
||||
include:
|
||||
- os: macos-13
|
||||
- os: macos-14
|
||||
artifact_name: meilisearch
|
||||
asset_name: meilisearch-macos-amd64
|
||||
- os: windows-2022
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
|
||||
publish-macos-apple-silicon:
|
||||
name: Publish binary for macOS silicon
|
||||
runs-on: macos-13
|
||||
runs-on: macos-14
|
||||
needs: check-version
|
||||
strategy:
|
||||
matrix:
|
||||
|
||||
2
.github/workflows/test-suite.yml
vendored
2
.github/workflows/test-suite.yml
vendored
@@ -47,7 +47,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [macos-13, windows-2022]
|
||||
os: [macos-14, windows-2022]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- name: Cache dependencies
|
||||
|
||||
@@ -438,12 +438,15 @@ async fn multipart_stream_to_s3(
|
||||
db_name: String,
|
||||
reader: std::io::PipeReader,
|
||||
) -> Result<(), Error> {
|
||||
use std::{collections::VecDeque, os::fd::OwnedFd, path::PathBuf};
|
||||
use std::collections::VecDeque;
|
||||
use std::io;
|
||||
use std::os::fd::OwnedFd;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use reqwest::{Client, Response};
|
||||
use rusty_s3::S3Action as _;
|
||||
use rusty_s3::{actions::CreateMultipartUpload, Bucket, BucketError, Credentials, UrlStyle};
|
||||
use rusty_s3::actions::CreateMultipartUpload;
|
||||
use rusty_s3::{Bucket, BucketError, Credentials, S3Action as _, UrlStyle};
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
let reader = OwnedFd::from(reader);
|
||||
@@ -517,7 +520,6 @@ async fn multipart_stream_to_s3(
|
||||
while buffer.len() < (s3_multipart_part_size as usize / 2) {
|
||||
// Wait for the pipe to be readable
|
||||
|
||||
use std::io;
|
||||
reader.readable().await?;
|
||||
|
||||
match reader.try_read_buf(&mut buffer) {
|
||||
@@ -581,15 +583,17 @@ async fn multipart_stream_to_s3(
|
||||
async move {
|
||||
match client.post(url).body(body).send().await {
|
||||
Ok(resp) if resp.status().is_client_error() => {
|
||||
resp.error_for_status().map_err(backoff::Error::Permanent)
|
||||
Err(backoff::Error::Permanent(Error::S3Error {
|
||||
status: resp.status(),
|
||||
body: resp.text().await.unwrap_or_default(),
|
||||
}))
|
||||
}
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err(backoff::Error::transient(e)),
|
||||
Err(e) => Err(backoff::Error::transient(Error::S3HttpError(e))),
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(Error::S3HttpError)?;
|
||||
.await?;
|
||||
|
||||
let status = resp.status();
|
||||
let body = resp.text().await.map_err(|e| Error::S3Error { status, body: e.to_string() })?;
|
||||
|
||||
@@ -195,7 +195,7 @@ struct Infos {
|
||||
experimental_enable_logs_route: bool,
|
||||
experimental_reduce_indexing_memory_usage: bool,
|
||||
experimental_max_number_of_batched_tasks: usize,
|
||||
experimental_limit_batched_tasks_total_size: u64,
|
||||
experimental_limit_batched_tasks_total_size: Option<u64>,
|
||||
experimental_network: bool,
|
||||
experimental_multimodal: bool,
|
||||
experimental_chat_completions: bool,
|
||||
@@ -359,7 +359,7 @@ impl Infos {
|
||||
http_payload_size_limit,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
experimental_limit_batched_tasks_total_size:
|
||||
experimental_limit_batched_tasks_total_size.into(),
|
||||
experimental_limit_batched_tasks_total_size.map(|size| size.as_u64()),
|
||||
task_queue_webhook: task_webhook_url.is_some(),
|
||||
task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
|
||||
log_level: log_level.to_string(),
|
||||
|
||||
@@ -230,7 +230,17 @@ pub fn setup_meilisearch(
|
||||
cleanup_enabled: !opt.experimental_replication_parameters,
|
||||
max_number_of_tasks: 1_000_000,
|
||||
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
|
||||
batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(),
|
||||
batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.map_or_else(
|
||||
|| {
|
||||
opt.indexer_options
|
||||
.max_indexing_memory
|
||||
// By default, we use half of the available memory to determine the size of batched tasks
|
||||
.map_or(u64::MAX, |mem| mem.as_u64() / 2)
|
||||
// And never exceed 10 GiB when we infer the limit
|
||||
.min(10 * 1024 * 1024 * 1024)
|
||||
},
|
||||
|size| size.as_u64(),
|
||||
),
|
||||
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
|
||||
index_count: DEFAULT_INDEX_COUNT,
|
||||
instance_features: opt.to_instance_features(),
|
||||
|
||||
@@ -473,11 +473,14 @@ pub struct Opt {
|
||||
#[serde(default = "default_limit_batched_tasks")]
|
||||
pub experimental_max_number_of_batched_tasks: usize,
|
||||
|
||||
/// Experimentally reduces the maximum total size, in bytes, of tasks that will be processed at once,
|
||||
/// see: <https://github.com/orgs/meilisearch/discussions/801>
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())]
|
||||
#[serde(default = "default_limit_batched_tasks_total_size")]
|
||||
pub experimental_limit_batched_tasks_total_size: Byte,
|
||||
/// Experimentally controls the maximum total size, in bytes, of tasks that will be processed
|
||||
/// simultaneously. When unspecified, defaults to half of the maximum indexing memory and
|
||||
/// clamped to 10 GiB.
|
||||
///
|
||||
/// See: <https://github.com/orgs/meilisearch/discussions/801>
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE)]
|
||||
#[serde(default)]
|
||||
pub experimental_limit_batched_tasks_total_size: Option<Byte>,
|
||||
|
||||
/// Enables experimental caching of search query embeddings. The value represents the maximal number of entries in the cache of each
|
||||
/// distinct embedder.
|
||||
@@ -701,10 +704,12 @@ impl Opt {
|
||||
MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
|
||||
experimental_max_number_of_batched_tasks.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
|
||||
experimental_limit_batched_tasks_total_size.to_string(),
|
||||
);
|
||||
if let Some(limit) = experimental_limit_batched_tasks_total_size {
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
|
||||
limit.to_string(),
|
||||
);
|
||||
}
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES,
|
||||
experimental_embedding_cache_entries.to_string(),
|
||||
@@ -1273,10 +1278,6 @@ fn default_limit_batched_tasks() -> usize {
|
||||
usize::MAX
|
||||
}
|
||||
|
||||
fn default_limit_batched_tasks_total_size() -> Byte {
|
||||
Byte::from_u64(u64::MAX)
|
||||
}
|
||||
|
||||
fn default_embedding_cache_entries() -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use crate::search::{Personalize, SearchResult};
|
||||
use meilisearch_types::{
|
||||
error::{Code, ErrorCode, ResponseError},
|
||||
milli::TimeBudget,
|
||||
};
|
||||
use std::time::Duration;
|
||||
|
||||
use meilisearch_types::error::{Code, ErrorCode, ResponseError};
|
||||
use meilisearch_types::milli::TimeBudget;
|
||||
use rand::Rng;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::search::{Personalize, SearchResult};
|
||||
|
||||
const COHERE_API_URL: &str = "https://api.cohere.ai/v1/rerank";
|
||||
const MAX_RETRIES: u32 = 10;
|
||||
|
||||
|
||||
@@ -18,10 +18,9 @@ use serde::{Deserialize, Serialize};
|
||||
use utoipa::ToSchema;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::search::SearchMetadata;
|
||||
|
||||
use super::super::{ComputedFacets, FacetStats, HitsInfo, SearchHit, SearchQueryWithIndex};
|
||||
use crate::milli::vector::Embedding;
|
||||
use crate::search::SearchMetadata;
|
||||
|
||||
pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0;
|
||||
|
||||
|
||||
@@ -1339,3 +1339,117 @@ async fn get_document_with_vectors() {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
/// Checks that paginating `fetch_documents` with a sort on `name` returns
/// consecutive pages: four requests with `limit` 2 and offsets 0, 2, 4 and 6
/// must yield ids 0 through 7 in order, with no document skipped or repeated.
#[actix_rt::test]
|
||||
async fn test_fetch_documents_pagination_with_sorting() {
|
||||
let server = Server::new_shared();
|
||||
let index = server.unique_index();
|
||||
let (task, _code) = index.create(None).await;
|
||||
server.wait_task(task.uid()).await.succeeded();
|
||||
|
||||
// Set name as sortable attribute
|
||||
let (task, code) = index.update_settings_sortable_attributes(json!(["name"])).await;
|
||||
assert_eq!(code, 202);
|
||||
server.wait_task(task.uid()).await.succeeded();
|
||||
|
||||
// 50 documents: ids 0..=4 get distinct names, while every id >= 5 shares
// "doc_00005" (because of `min(i, 5)`), so most documents tie on the sort key.
let documents = json!((0..50)
|
||||
.map(|i| json!({"id": i, "name": format!("doc_{:05}", std::cmp::min(i, 5))}))
|
||||
.collect::<Vec<_>>());
|
||||
|
||||
// Add documents as described in the bug report
|
||||
let (task, code) = index.add_documents(documents, None).await;
|
||||
assert_eq!(code, 202);
|
||||
server.wait_task(task.uid()).await.succeeded();
|
||||
|
||||
// Request 1 (first page): offset 0, limit 2
|
||||
let (response, code) = index
|
||||
.fetch_documents(json!({
|
||||
"offset": 0,
|
||||
"limit": 2,
|
||||
"sort": ["name:asc"]
|
||||
}))
|
||||
.await;
|
||||
assert_eq!(code, 200);
|
||||
let results = response["results"].as_array().unwrap();
|
||||
snapshot!(json_string!(results), @r###"
|
||||
[
|
||||
{
|
||||
"id": 0,
|
||||
"name": "doc_00000"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "doc_00001"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
||||
// Request 2 (second page): offset 2, limit 2
|
||||
let (response, code) = index
|
||||
.fetch_documents(json!({
|
||||
"offset": 2,
|
||||
"limit": 2,
|
||||
"sort": ["name:asc"]
|
||||
}))
|
||||
.await;
|
||||
assert_eq!(code, 200);
|
||||
let results = response["results"].as_array().unwrap();
|
||||
snapshot!(json_string!(results), @r###"
|
||||
[
|
||||
{
|
||||
"id": 2,
|
||||
"name": "doc_00002"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "doc_00003"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
||||
// Request 3 (third page): offset 4, limit 2
|
||||
let (response, code) = index
|
||||
.fetch_documents(json!({
|
||||
"offset": 4,
|
||||
"limit": 2,
|
||||
"sort": ["name:asc"]
|
||||
}))
|
||||
.await;
|
||||
assert_eq!(code, 200);
|
||||
let results = response["results"].as_array().unwrap();
|
||||
snapshot!(json_string!(results), @r###"
|
||||
[
|
||||
{
|
||||
"id": 4,
|
||||
"name": "doc_00004"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"name": "doc_00005"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
||||
// Request 4 (fourth page): offset 6, limit 2
// Both expected hits share the name "doc_00005"; the snapshot pins them in
// ascending id order.
|
||||
let (response, code) = index
|
||||
.fetch_documents(json!({
|
||||
"offset": 6,
|
||||
"limit": 2,
|
||||
"sort": ["name:asc"]
|
||||
}))
|
||||
.await;
|
||||
assert_eq!(code, 200);
|
||||
let results = response["results"].as_array().unwrap();
|
||||
snapshot!(json_string!(results), @r###"
|
||||
[
|
||||
{
|
||||
"id": 6,
|
||||
"name": "doc_00005"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"name": "doc_00005"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
@@ -137,6 +137,60 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
}])
|
||||
});
|
||||
|
||||
/// Ten-document fixture with string ids "1" through "10".
///
/// Documents "3" through "10" are identical copies of the same
/// "Captain Marvel" / "a Shazam ersatz" document; only their ids differ.
static MANY_DOCS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Planet",
|
||||
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||
"id": "2",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "3",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "4",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "5",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "6",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "7",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "8",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "9",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "10",
|
||||
}])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search() {
|
||||
let server = Server::new_shared();
|
||||
@@ -449,6 +503,38 @@ async fn simple_search_hf() {
|
||||
snapshot!(response["semanticHitCount"], @"3");
|
||||
}
|
||||
|
||||
/// Regression test named after issue 5976.
///
/// Runs a purely semantic search (`semanticRatio` of 1.0) with
/// `retrieveVectors` enabled over the `MANY_DOCS` fixture and expects ten
/// hits, each carrying a non-empty `_vectors.default.embeddings` array.
#[actix_rt::test]
|
||||
async fn issue_5976_missing_docs_hf() {
|
||||
let server = Server::new_shared();
|
||||
let index = index_with_documents_hf(server, &MANY_DOCS).await;
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Wonder replacement", "hybrid": {"embedder": "default", "semanticRatio": 1.0}, "retrieveVectors": true}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
// One boolean per hit: `true` when the hit's embeddings array is empty.
// The `unwrap` calls make the test panic if `hits` or `embeddings` is not an array.
let are_empty: Vec<_> = response["hits"]
|
||||
.as_array()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.map(|hit| hit["_vectors"]["default"]["embeddings"].as_array().unwrap().is_empty())
|
||||
.collect();
|
||||
// Ten `false` entries: all ten documents are returned and none lacks embeddings.
snapshot!(json!(are_empty), @r###"
|
||||
[
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn distribution_shift() {
|
||||
let server = Server::new_shared();
|
||||
|
||||
@@ -87,7 +87,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
|
||||
};
|
||||
|
||||
// Otherwise don't directly iterate over children, skip them if we know we will go further
|
||||
let mut to_skip = n - 1;
|
||||
let mut to_skip = n;
|
||||
while to_skip > 0 {
|
||||
if let Err(e) = SortedDocumentsIterator::update_current(
|
||||
current_child,
|
||||
@@ -108,7 +108,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
|
||||
continue;
|
||||
} else {
|
||||
// The current iterator is large enough, so we can forward the call to it.
|
||||
return inner.nth(to_skip + 1);
|
||||
return inner.nth(to_skip);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,8 @@ use crate::{
|
||||
pub struct Metadata {
|
||||
/// The weight as defined in the FieldidsWeightsMap of the searchable attribute if it is searchable.
|
||||
pub searchable: Option<Weight>,
|
||||
/// The field is part of the exact attributes.
|
||||
pub exact: bool,
|
||||
/// The field is part of the sortable attributes.
|
||||
pub sortable: bool,
|
||||
/// The field is defined as the distinct attribute.
|
||||
@@ -209,6 +211,7 @@ impl Metadata {
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MetadataBuilder {
|
||||
searchable_attributes: Option<Vec<String>>,
|
||||
exact_searchable_attributes: Vec<String>,
|
||||
filterable_attributes: Vec<FilterableAttributesRule>,
|
||||
sortable_attributes: HashSet<String>,
|
||||
localized_attributes: Option<Vec<LocalizedAttributesRule>>,
|
||||
@@ -220,15 +223,18 @@ impl MetadataBuilder {
|
||||
pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result<Self> {
|
||||
let searchable_attributes = index
|
||||
.user_defined_searchable_fields(rtxn)?
|
||||
.map(|fields| fields.into_iter().map(|s| s.to_string()).collect());
|
||||
.map(|fields| fields.into_iter().map(String::from).collect());
|
||||
let exact_searchable_attributes =
|
||||
index.exact_attributes(rtxn)?.into_iter().map(String::from).collect();
|
||||
let filterable_attributes = index.filterable_attributes_rules(rtxn)?;
|
||||
let sortable_attributes = index.sortable_fields(rtxn)?;
|
||||
let localized_attributes = index.localized_attributes_rules(rtxn)?;
|
||||
let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string());
|
||||
let distinct_attribute = index.distinct_field(rtxn)?.map(String::from);
|
||||
let asc_desc_attributes = index.asc_desc_fields(rtxn)?;
|
||||
|
||||
Ok(Self::new(
|
||||
searchable_attributes,
|
||||
exact_searchable_attributes,
|
||||
filterable_attributes,
|
||||
sortable_attributes,
|
||||
localized_attributes,
|
||||
@@ -242,6 +248,7 @@ impl MetadataBuilder {
|
||||
/// This is used for testing, prefer using `MetadataBuilder::from_index` instead.
|
||||
pub fn new(
|
||||
searchable_attributes: Option<Vec<String>>,
|
||||
exact_searchable_attributes: Vec<String>,
|
||||
filterable_attributes: Vec<FilterableAttributesRule>,
|
||||
sortable_attributes: HashSet<String>,
|
||||
localized_attributes: Option<Vec<LocalizedAttributesRule>>,
|
||||
@@ -256,6 +263,7 @@ impl MetadataBuilder {
|
||||
|
||||
Self {
|
||||
searchable_attributes,
|
||||
exact_searchable_attributes,
|
||||
filterable_attributes,
|
||||
sortable_attributes,
|
||||
localized_attributes,
|
||||
@@ -269,6 +277,7 @@ impl MetadataBuilder {
|
||||
// Vectors fields are not searchable, filterable, distinct or asc_desc
|
||||
return Metadata {
|
||||
searchable: None,
|
||||
exact: false,
|
||||
sortable: false,
|
||||
distinct: false,
|
||||
asc_desc: false,
|
||||
@@ -296,6 +305,7 @@ impl MetadataBuilder {
|
||||
// Geo fields are not searchable, distinct or asc_desc
|
||||
return Metadata {
|
||||
searchable: None,
|
||||
exact: false,
|
||||
sortable,
|
||||
distinct: false,
|
||||
asc_desc: false,
|
||||
@@ -309,6 +319,7 @@ impl MetadataBuilder {
|
||||
debug_assert!(!sortable, "geojson fields should not be sortable");
|
||||
return Metadata {
|
||||
searchable: None,
|
||||
exact: false,
|
||||
sortable,
|
||||
distinct: false,
|
||||
asc_desc: false,
|
||||
@@ -329,6 +340,8 @@ impl MetadataBuilder {
|
||||
None => Some(0),
|
||||
};
|
||||
|
||||
let exact = self.exact_searchable_attributes.iter().any(|attr| is_faceted_by(field, attr));
|
||||
|
||||
let distinct =
|
||||
self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field);
|
||||
let asc_desc = self.asc_desc_attributes.contains(field);
|
||||
@@ -343,6 +356,7 @@ impl MetadataBuilder {
|
||||
|
||||
Metadata {
|
||||
searchable,
|
||||
exact,
|
||||
sortable,
|
||||
distinct,
|
||||
asc_desc,
|
||||
|
||||
@@ -8,17 +8,26 @@ use bumpalo::Bump;
|
||||
|
||||
use super::match_searchable_field;
|
||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
use crate::attribute_patterns::match_field_legacy;
|
||||
use crate::fields_ids_map::metadata::Metadata;
|
||||
use crate::update::new::document::DocumentContext;
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::extract::perm_json_p::contained_in;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::indexer::settings_changes::{
|
||||
settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor,
|
||||
};
|
||||
use crate::update::new::ref_cell_ext::RefCellExt as _;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
use crate::update::new::{DocumentChange, DocumentIdentifiers};
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::{
|
||||
bucketed_position, DocumentId, FieldId, PatternMatch, Result, UserError,
|
||||
MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
@@ -34,6 +43,15 @@ pub struct WordDocidsBalancedCaches<'extractor> {
|
||||
|
||||
unsafe impl MostlySend for WordDocidsBalancedCaches<'_> {}
|
||||
|
||||
/// Whether to extract or skip fields during word extraction.
///
/// Passed to `insert_add_u32` / `insert_del_u32` to decide whether a word is
/// also recorded in the field-id-based structures.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
enum FieldDbExtraction {
|
||||
/// Extract the word and put it into the fid-based databases: the word is
/// written to `word_fid_docids` and counted in `fid_word_count`.
|
||||
Extract,
|
||||
/// Do not store the word in the fid-based databases: both the
/// `word_fid_docids` insertion and the `fid_word_count` update are skipped.
|
||||
Skip,
|
||||
}
|
||||
|
||||
impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
|
||||
Self {
|
||||
@@ -47,12 +65,14 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_add_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
field_db_extraction: FieldDbExtraction,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
@@ -66,11 +86,13 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_add_u32(&buffer, docid)?;
|
||||
if field_db_extraction == FieldDbExtraction::Extract {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_add_u32(&buffer, docid)?;
|
||||
}
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
@@ -83,21 +105,26 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
|
||||
.or_insert((None, Some(1)));
|
||||
if field_db_extraction == FieldDbExtraction::Extract {
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
|
||||
.or_insert((None, Some(1)));
|
||||
}
|
||||
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_del_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
field_db_extraction: FieldDbExtraction,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
@@ -111,11 +138,13 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_del_u32(&buffer, docid)?;
|
||||
if field_db_extraction == FieldDbExtraction::Extract {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_del_u32(&buffer, docid)?;
|
||||
}
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
@@ -128,10 +157,12 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
|
||||
.or_insert((Some(1), None));
|
||||
if field_db_extraction == FieldDbExtraction::Extract {
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
|
||||
.or_insert((Some(1), None));
|
||||
}
|
||||
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
@@ -325,6 +356,24 @@ impl WordDocidsExtractors {
|
||||
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|
||||
|| disabled_typos_terms.is_exact(word)
|
||||
};
|
||||
|
||||
let mut should_tokenize = |field_name: &str| {
|
||||
let Some((field_id, meta)) = new_fields_ids_map.id_with_metadata_or_insert(field_name)
|
||||
else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
let pattern_match = if meta.is_searchable() {
|
||||
PatternMatch::Match
|
||||
} else {
|
||||
// TODO: should be a match on the field_name using `match_field_legacy` function,
|
||||
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
|
||||
PatternMatch::Parent
|
||||
};
|
||||
|
||||
Ok((field_id, pattern_match))
|
||||
};
|
||||
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
@@ -333,13 +382,14 @@ impl WordDocidsExtractors {
|
||||
pos,
|
||||
word,
|
||||
is_exact(fname, word),
|
||||
FieldDbExtraction::Extract,
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut should_tokenize,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
@@ -361,13 +411,14 @@ impl WordDocidsExtractors {
|
||||
pos,
|
||||
word,
|
||||
is_exact(fname, word),
|
||||
FieldDbExtraction::Extract,
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut should_tokenize,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
@@ -377,13 +428,14 @@ impl WordDocidsExtractors {
|
||||
pos,
|
||||
word,
|
||||
is_exact(fname, word),
|
||||
FieldDbExtraction::Extract,
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.merged(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut should_tokenize,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
@@ -394,13 +446,14 @@ impl WordDocidsExtractors {
|
||||
pos,
|
||||
word,
|
||||
is_exact(fname, word),
|
||||
FieldDbExtraction::Extract,
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.inserted(),
|
||||
new_fields_ids_map,
|
||||
&mut should_tokenize,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
@@ -411,3 +464,300 @@ impl WordDocidsExtractors {
|
||||
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Shared, read-only inputs of the word docids extraction that runs after a
/// settings change; it is the value on which `SettingsChangeExtractor` is
/// implemented.
pub struct WordDocidsSettingsExtractorsData<'a, SD> {
|
||||
/// Document tokenizer handed to `extract_settings_change` for every document.
tokenizer: DocumentTokenizer<'a>,
|
||||
/// Memory limit forwarded to `WordDocidsBalancedCaches::new_in`.
/// NOTE(review): `None` presumably means "no limit" — confirm in `new_in`.
max_memory_by_thread: Option<usize>,
|
||||
/// Bucket count forwarded to `WordDocidsBalancedCaches::new_in`;
/// `run_extraction` sets it to `rayon::current_num_threads()`.
buckets: usize,
|
||||
/// Settings delta handed to `extract_settings_change` for every document.
settings_delta: &'a SD,
|
||||
}
|
||||
|
||||
impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
|
||||
for WordDocidsSettingsExtractorsData<'_, SD>
|
||||
{
|
||||
type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
|
||||
|
||||
fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {
|
||||
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.max_memory_by_thread,
|
||||
extractor_alloc,
|
||||
))))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&'doc self,
|
||||
documents: impl Iterator<Item = crate::Result<DocumentIdentifiers<'doc>>>,
|
||||
context: &'doc DocumentContext<Self::Data>,
|
||||
) -> crate::Result<()> {
|
||||
for document in documents {
|
||||
let document = document?;
|
||||
SettingsChangeWordDocidsExtractors::extract_settings_change(
|
||||
document,
|
||||
context,
|
||||
&self.tokenizer,
|
||||
self.settings_delta,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Data-less type that groups the functions extracting word docids after a
/// settings change (`run_extraction` and `extract_settings_change`).
pub struct SettingsChangeWordDocidsExtractors;
|
||||
|
||||
impl SettingsChangeWordDocidsExtractors {
|
||||
pub fn run_extraction<'fid, 'indexer, 'index, 'extractor, SD, MSP>(
|
||||
settings_delta: &SD,
|
||||
documents: &'indexer DocumentsIndentifiers<'indexer>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: IndexingStep,
|
||||
) -> Result<WordDocidsCaches<'extractor>>
|
||||
where
|
||||
SD: SettingsDelta + Sync,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
// Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
|
||||
// TODO we need to read the new AND old settings to support changing global parameters
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
let localized_attributes_rules =
|
||||
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
let extractor_data = WordDocidsSettingsExtractorsData {
|
||||
tokenizer: document_tokenizer,
|
||||
max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
|
||||
buckets: rayon::current_num_threads(),
|
||||
settings_delta,
|
||||
};
|
||||
let datastore = ThreadLocal::new();
|
||||
{
|
||||
let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
settings_change_extract(
|
||||
documents,
|
||||
&extractor_data,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
step,
|
||||
)?;
|
||||
}
|
||||
|
||||
let mut merger = WordDocidsCaches::new();
|
||||
for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
|
||||
merger.push(cache)?;
|
||||
}
|
||||
|
||||
Ok(merger)
|
||||
}
|
||||
|
||||
// TODO find a better name (extract_document_change?)
|
||||
// and document this method.
|
||||
fn extract_settings_change<SD: SettingsDelta>(
|
||||
document: DocumentIdentifiers<'_>,
|
||||
context: &DocumentContext<RefCell<Option<WordDocidsBalancedCaches>>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
settings_delta: &SD,
|
||||
) -> Result<()> {
|
||||
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
|
||||
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let new_fields_ids_map = settings_delta.new_fields_ids_map();
|
||||
let old_fields_ids_map = context.index.fields_ids_map_with_metadata(&context.rtxn)?;
|
||||
let old_searchable = settings_delta.old_searchable_attributes().as_ref();
|
||||
let new_searchable = settings_delta.new_searchable_attributes().as_ref();
|
||||
|
||||
let current_document = document.current(
|
||||
&context.rtxn,
|
||||
context.index,
|
||||
old_fields_ids_map.as_fields_ids_map(),
|
||||
)?;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum ActionToOperate {
|
||||
ReindexAllFields,
|
||||
// TODO improve by listing field prefixes
|
||||
IndexAddedFields,
|
||||
SkipDocument,
|
||||
}
|
||||
|
||||
let mut action = ActionToOperate::SkipDocument;
|
||||
// Here we do a preliminary check to determine the action to take.
|
||||
// This check doesn't trigger the tokenizer as we never return
|
||||
// PatternMatch::Match.
|
||||
document_tokenizer.tokenize_document(
|
||||
current_document,
|
||||
&mut |field_name| {
|
||||
let fid = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
|
||||
|
||||
if action == ActionToOperate::ReindexAllFields {
|
||||
return Ok((fid, PatternMatch::NoMatch));
|
||||
}
|
||||
|
||||
let old_field_metadata = old_fields_ids_map.metadata(fid).unwrap();
|
||||
let new_field_metadata = new_fields_ids_map.metadata(fid).unwrap();
|
||||
|
||||
action = match (old_field_metadata, new_field_metadata) {
|
||||
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. })
|
||||
if old_exact != new_exact =>
|
||||
{
|
||||
ActionToOperate::ReindexAllFields
|
||||
}
|
||||
(Metadata { searchable: Some(_), .. }, Metadata { searchable: None, .. }) => {
|
||||
ActionToOperate::ReindexAllFields
|
||||
}
|
||||
(Metadata { searchable: None, .. }, Metadata { searchable: Some(_), .. }) => {
|
||||
ActionToOperate::IndexAddedFields
|
||||
}
|
||||
_ => action,
|
||||
};
|
||||
|
||||
Ok((fid, PatternMatch::Parent))
|
||||
},
|
||||
&mut |_, _, _, _| Ok(()),
|
||||
)?;
|
||||
|
||||
// Early return when we don't need to index the document
|
||||
if action == ActionToOperate::SkipDocument {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut should_tokenize = |field_name: &str| {
|
||||
let field_id = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
|
||||
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
|
||||
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
|
||||
|
||||
let pattern_match = match action {
|
||||
ActionToOperate::ReindexAllFields => {
|
||||
if old_field_metadata.is_searchable() || new_field_metadata.is_searchable() {
|
||||
PatternMatch::Match
|
||||
// If any old or new field is searchable then we need to iterate over all fields
|
||||
// else if any field matches we need to iterate over all fields
|
||||
} else if old_searchable.zip(new_searchable).is_none_or(|(old, new)| {
|
||||
old.iter().chain(new).any(|attr| {
|
||||
match_field_legacy(attr, field_name) == PatternMatch::Parent
|
||||
})
|
||||
}) {
|
||||
PatternMatch::Parent
|
||||
} else {
|
||||
PatternMatch::NoMatch
|
||||
}
|
||||
}
|
||||
ActionToOperate::IndexAddedFields => {
|
||||
let has_searchable_children =
|
||||
|field_name: &str, searchable: Option<&Vec<String>>| {
|
||||
searchable.is_none_or(|fields| {
|
||||
fields.iter().any(|attr| {
|
||||
match_field_legacy(attr, field_name) != PatternMatch::Parent
|
||||
})
|
||||
})
|
||||
};
|
||||
|
||||
// Was not searchable but now is
|
||||
if !old_field_metadata.is_searchable() && new_field_metadata.is_searchable() {
|
||||
PatternMatch::Match
|
||||
// If the field was not a parent of a searchable before and is now
|
||||
} else if !has_searchable_children(field_name, old_searchable)
|
||||
&& has_searchable_children(field_name, new_searchable)
|
||||
{
|
||||
PatternMatch::Parent
|
||||
} else {
|
||||
PatternMatch::NoMatch
|
||||
}
|
||||
}
|
||||
ActionToOperate::SkipDocument => unreachable!(),
|
||||
};
|
||||
|
||||
Ok((field_id, pattern_match))
|
||||
};
|
||||
|
||||
let old_disabled_typos_terms = settings_delta.old_disabled_typos_terms();
|
||||
let new_disabled_typos_terms = settings_delta.new_disabled_typos_terms();
|
||||
let mut token_fn = |_field_name: &str, field_id, pos, word: &str| {
|
||||
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
|
||||
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
|
||||
|
||||
match (old_field_metadata, new_field_metadata) {
|
||||
(
|
||||
Metadata { searchable: Some(_), exact: old_exact, .. },
|
||||
Metadata { searchable: None, .. },
|
||||
) => cached_sorter.insert_del_u32(
|
||||
field_id,
|
||||
pos,
|
||||
word,
|
||||
old_exact || old_disabled_typos_terms.is_exact(word),
|
||||
// We deleted the field globally
|
||||
FieldDbExtraction::Skip,
|
||||
document.docid(),
|
||||
doc_alloc,
|
||||
),
|
||||
(
|
||||
Metadata { searchable: None, .. },
|
||||
Metadata { searchable: Some(_), exact: new_exact, .. },
|
||||
) => cached_sorter.insert_add_u32(
|
||||
field_id,
|
||||
pos,
|
||||
word,
|
||||
new_exact || new_disabled_typos_terms.is_exact(word),
|
||||
FieldDbExtraction::Extract,
|
||||
document.docid(),
|
||||
doc_alloc,
|
||||
),
|
||||
(Metadata { searchable: None, .. }, Metadata { searchable: None, .. }) => {
|
||||
unreachable!()
|
||||
}
|
||||
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. }) => {
|
||||
cached_sorter.insert_del_u32(
|
||||
field_id,
|
||||
pos,
|
||||
word,
|
||||
old_exact || old_disabled_typos_terms.is_exact(word),
|
||||
// The field has already been extracted
|
||||
FieldDbExtraction::Skip,
|
||||
document.docid(),
|
||||
doc_alloc,
|
||||
)?;
|
||||
cached_sorter.insert_add_u32(
|
||||
field_id,
|
||||
pos,
|
||||
word,
|
||||
new_exact || new_disabled_typos_terms.is_exact(word),
|
||||
// The field has already been extracted
|
||||
FieldDbExtraction::Skip,
|
||||
document.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// TODO we must tokenize twice when we change global parameters like stop words,
|
||||
// the language settings, dictionary, separators, non-separators...
|
||||
document_tokenizer.tokenize_document(
|
||||
current_document,
|
||||
&mut should_tokenize,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,9 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
use crate::{
|
||||
FieldId, GlobalFieldsIdsMap, PatternMatch, Result, UserError, MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractorData<'a> {
|
||||
tokenizer: DocumentTokenizer<'a>,
|
||||
@@ -279,7 +281,24 @@ fn process_document_tokens<'doc>(
|
||||
word_positions.push_back((Rc::from(word), pos));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
|
||||
|
||||
let mut should_tokenize = |field_name: &str| {
|
||||
let Some((field_id, meta)) = fields_ids_map.id_with_metadata_or_insert(field_name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
let pattern_match = if meta.is_searchable() {
|
||||
PatternMatch::Match
|
||||
} else {
|
||||
// TODO: should be a match on the field_name using `match_field_legacy` function,
|
||||
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
|
||||
PatternMatch::Parent
|
||||
};
|
||||
|
||||
Ok((field_id, pattern_match))
|
||||
};
|
||||
|
||||
document_tokenizer.tokenize_document(document, &mut should_tokenize, &mut token_fn)?;
|
||||
|
||||
drain_word_positions(word_positions, word_pair_proximity);
|
||||
Ok(())
|
||||
|
||||
@@ -2,6 +2,7 @@ mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod tokenize_document;
|
||||
|
||||
pub use extract_word_docids::SettingsChangeWordDocidsExtractors;
|
||||
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
|
||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
|
||||
|
||||
@@ -8,10 +8,7 @@ use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::perm_json_p::{
|
||||
seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
|
||||
};
|
||||
use crate::{
|
||||
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
|
||||
MAX_WORD_LENGTH,
|
||||
};
|
||||
use crate::{FieldId, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH};
|
||||
|
||||
// todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten
|
||||
const MAX_DISTANCE: u32 = 8;
|
||||
@@ -26,22 +23,16 @@ impl DocumentTokenizer<'_> {
|
||||
pub fn tokenize_document<'doc>(
|
||||
&self,
|
||||
document: impl Document<'doc>,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
should_tokenize: &mut impl FnMut(&str) -> Result<(FieldId, PatternMatch)>,
|
||||
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut field_position = HashMap::new();
|
||||
let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
|
||||
let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
if meta.is_searchable() {
|
||||
let (field_id, pattern_match) = should_tokenize(field_name)?;
|
||||
if pattern_match == PatternMatch::Match {
|
||||
self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
|
||||
}
|
||||
|
||||
// todo: should be a match on the field_name using `match_field_legacy` function,
|
||||
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
|
||||
Ok(PatternMatch::Match)
|
||||
Ok(pattern_match)
|
||||
};
|
||||
|
||||
for entry in document.iter_top_level_fields() {
|
||||
@@ -192,7 +183,7 @@ mod test {
|
||||
use super::*;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::update::new::document::{DocumentFromVersions, Versions};
|
||||
use crate::FieldsIdsMap;
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, UserError};
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_document() {
|
||||
@@ -231,6 +222,7 @@ mod test {
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
None,
|
||||
None,
|
||||
Default::default(),
|
||||
@@ -251,15 +243,19 @@ mod test {
|
||||
let document = Versions::single(document);
|
||||
let document = DocumentFromVersions::new(&document);
|
||||
|
||||
let mut should_tokenize = |field_name: &str| {
|
||||
let Some(field_id) = global_fields_ids_map.id_or_insert(field_name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
Ok((field_id, PatternMatch::Match))
|
||||
};
|
||||
|
||||
document_tokenizer
|
||||
.tokenize_document(
|
||||
document,
|
||||
&mut global_fields_ids_map,
|
||||
&mut |_fname, fid, pos, word| {
|
||||
words.insert([fid, pos], word.to_string());
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.tokenize_document(document, &mut should_tokenize, &mut |_fname, fid, pos, word| {
|
||||
words.insert([fid, pos], word.to_string());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
snapshot!(format!("{:#?}", words), @r###"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::cell::RefCell;
|
||||
use std::fmt::Debug;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use bumpalo::collections::Vec as BVec;
|
||||
use bumpalo::Bump;
|
||||
@@ -27,7 +28,10 @@ use crate::vector::extractor::{
|
||||
use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed};
|
||||
use crate::vector::settings::ReindexAction;
|
||||
use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment};
|
||||
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError};
|
||||
use crate::{
|
||||
DocumentId, FieldDistribution, GlobalFieldsIdsMap, InternalError, Result, ThreadPoolNoAbort,
|
||||
UserError,
|
||||
};
|
||||
|
||||
pub struct EmbeddingExtractor<'a, 'b> {
|
||||
embedders: &'a RuntimeEmbedders,
|
||||
@@ -321,6 +325,15 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
|
||||
let old_embedders = self.settings_delta.old_embedders();
|
||||
let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
|
||||
|
||||
// We get a reference to the new and old fields ids maps but
|
||||
// note that those are local versions where updates to them
|
||||
// will not be reflected in the database. It's not an issue
|
||||
// because new settings do not generate new fields.
|
||||
let new_fields_ids_map = RwLock::new(self.settings_delta.new_fields_ids_map().clone());
|
||||
let new_fields_ids_map = RefCell::new(GlobalFieldsIdsMap::new(&new_fields_ids_map));
|
||||
let old_fields_ids_map = RwLock::new(self.settings_delta.old_fields_ids_map().clone());
|
||||
let old_fields_ids_map = RefCell::new(GlobalFieldsIdsMap::new(&old_fields_ids_map));
|
||||
|
||||
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
||||
let embedder_configs = context.index.embedding_configs();
|
||||
for (embedder_name, action) in self.settings_delta.embedder_actions().iter() {
|
||||
@@ -396,6 +409,7 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
|
||||
if !must_regenerate {
|
||||
continue;
|
||||
}
|
||||
|
||||
// we need to regenerate the prompts for the document
|
||||
chunks.settings_change_autogenerated(
|
||||
document.docid(),
|
||||
@@ -406,7 +420,8 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
self.settings_delta,
|
||||
context.new_fields_ids_map,
|
||||
&old_fields_ids_map,
|
||||
&new_fields_ids_map,
|
||||
&unused_vectors_distribution,
|
||||
old_is_user_provided,
|
||||
fragments_changed,
|
||||
@@ -442,7 +457,8 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor>
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
self.settings_delta,
|
||||
context.new_fields_ids_map,
|
||||
&old_fields_ids_map,
|
||||
&new_fields_ids_map,
|
||||
&unused_vectors_distribution,
|
||||
old_is_user_provided,
|
||||
true,
|
||||
@@ -638,7 +654,8 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
||||
external_docid: &'a str,
|
||||
document: D,
|
||||
settings_delta: &SD,
|
||||
fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
|
||||
old_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'a>>,
|
||||
new_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'a>>,
|
||||
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
||||
old_is_user_provided: bool,
|
||||
full_reindex: bool,
|
||||
@@ -733,10 +750,17 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
||||
old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template)
|
||||
};
|
||||
|
||||
let extractor =
|
||||
DocumentTemplateExtractor::new(document_template, doc_alloc, fields_ids_map);
|
||||
let extractor = DocumentTemplateExtractor::new(
|
||||
document_template,
|
||||
doc_alloc,
|
||||
new_fields_ids_map,
|
||||
);
|
||||
let old_extractor = old_document_template.map(|old_document_template| {
|
||||
DocumentTemplateExtractor::new(old_document_template, doc_alloc, fields_ids_map)
|
||||
DocumentTemplateExtractor::new(
|
||||
old_document_template,
|
||||
doc_alloc,
|
||||
old_fields_ids_map,
|
||||
)
|
||||
});
|
||||
let metadata =
|
||||
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
|
||||
|
||||
@@ -372,11 +372,10 @@ where
|
||||
SD: SettingsDelta + Sync,
|
||||
{
|
||||
// Create the list of document ids to extract
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let all_document_ids =
|
||||
indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::<Vec<_>>();
|
||||
let primary_key =
|
||||
primary_key_from_db(indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?;
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
let all_document_ids = index.documents_ids(&rtxn)?.into_iter().collect::<Vec<_>>();
|
||||
let primary_key = primary_key_from_db(index, &rtxn, &indexing_context.db_fields_ids_map)?;
|
||||
let documents = DocumentsIndentifiers::new(&all_document_ids, primary_key);
|
||||
|
||||
let span =
|
||||
@@ -391,6 +390,102 @@ where
|
||||
extractor_allocs,
|
||||
)?;
|
||||
|
||||
'_word_docids: {
|
||||
let WordDocidsCaches {
|
||||
word_docids,
|
||||
word_fid_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
} = {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||
let _entered = span.enter();
|
||||
SettingsChangeWordDocidsExtractors::run_extraction(
|
||||
settings_delta,
|
||||
&documents,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
IndexingStep::ExtractingWords,
|
||||
)?
|
||||
};
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::MergingWordCaches);
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
|
||||
let _entered = span.enter();
|
||||
indexing_context.progress.update_progress(MergingWordCache::WordDocids);
|
||||
|
||||
merge_and_send_docids(
|
||||
word_docids,
|
||||
index.word_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
|
||||
let _entered = span.enter();
|
||||
indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids);
|
||||
|
||||
merge_and_send_docids(
|
||||
word_fid_docids,
|
||||
index.word_fid_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordFidDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
|
||||
let _entered = span.enter();
|
||||
indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids);
|
||||
|
||||
merge_and_send_docids(
|
||||
exact_word_docids,
|
||||
index.exact_word_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<ExactWordDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids);
|
||||
|
||||
merge_and_send_docids(
|
||||
word_position_docids,
|
||||
index.word_position_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<WordPositionDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids);
|
||||
|
||||
merge_and_send_docids(
|
||||
fid_word_count_docids,
|
||||
index.field_id_word_count_docids.remap_types(),
|
||||
index,
|
||||
extractor_sender.docids::<FidWordCountDocids>(),
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
};
|
||||
|
||||
'vectors: {
|
||||
if settings_delta.embedder_actions().is_empty() {
|
||||
break 'vectors;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::{Arc, Once, RwLock};
|
||||
use std::thread::{self, Builder};
|
||||
@@ -8,9 +8,11 @@ use document_changes::{DocumentChanges, IndexingContext};
|
||||
pub use document_deletion::DocumentDeletion;
|
||||
pub use document_operation::{DocumentOperation, PayloadStats};
|
||||
use hashbrown::HashMap;
|
||||
use heed::types::DecodeIgnore;
|
||||
use heed::{RoTxn, RwTxn};
|
||||
pub use partial_dump::PartialDump;
|
||||
pub use post_processing::recompute_word_fst_from_word_docids_database;
|
||||
pub use settings_changes::settings_change_extract;
|
||||
pub use update_by_function::UpdateByFunction;
|
||||
pub use write::ChannelCongestion;
|
||||
use write::{build_vectors, update_index, write_to_db};
|
||||
@@ -21,11 +23,15 @@ use super::thread_local::ThreadLocal;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::progress::{EmbedderStats, Progress};
|
||||
use crate::update::new::steps::SettingsIndexerStep;
|
||||
use crate::update::new::FacetFieldIdsDelta;
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
|
||||
use crate::vector::{Embedder, RuntimeEmbedders, VectorStore};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
||||
use crate::{
|
||||
Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort,
|
||||
};
|
||||
|
||||
pub(crate) mod de;
|
||||
pub mod document_changes;
|
||||
@@ -235,6 +241,12 @@ where
|
||||
SD: SettingsDelta + Sync,
|
||||
{
|
||||
delete_old_embedders_and_fragments(wtxn, index, settings_delta)?;
|
||||
delete_old_fid_based_databases(wtxn, index, settings_delta, must_stop_processing, progress)?;
|
||||
|
||||
// TODO delete useless searchable databases
|
||||
// - Clear word_pair_proximity if byWord to byAttribute
|
||||
// - Clear fid_prefix_* in the post processing
|
||||
// - clear the prefix + fid_prefix if setting `PrefixSearch` is enabled
|
||||
|
||||
let mut bbbuffers = Vec::new();
|
||||
let finished_extraction = AtomicBool::new(false);
|
||||
@@ -293,6 +305,8 @@ where
|
||||
.unwrap()
|
||||
})?;
|
||||
|
||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
||||
|
||||
let new_embedders = settings_delta.new_embedders();
|
||||
let embedder_actions = settings_delta.embedder_actions();
|
||||
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
|
||||
@@ -327,6 +341,18 @@ where
|
||||
})
|
||||
.unwrap()?;
|
||||
|
||||
pool.install(|| {
|
||||
// WARN When implementing the facets don't forget this
|
||||
let facet_field_ids_delta = FacetFieldIdsDelta::new(0, 0);
|
||||
post_processing::post_process(
|
||||
indexing_context,
|
||||
wtxn,
|
||||
global_fields_ids_map,
|
||||
facet_field_ids_delta,
|
||||
)
|
||||
})
|
||||
.unwrap()?;
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::BuildingGeoJson);
|
||||
index.cellulite.build(
|
||||
wtxn,
|
||||
@@ -456,6 +482,98 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Deletes entries refering the provided
|
||||
/// fids from the fid-based databases.
|
||||
fn delete_old_fid_based_databases<SD, MSP>(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
settings_delta: &SD,
|
||||
must_stop_processing: &MSP,
|
||||
progress: &Progress,
|
||||
) -> Result<()>
|
||||
where
|
||||
SD: SettingsDelta + Sync,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
let fids_to_delete: Option<BTreeSet<_>> = {
|
||||
let rtxn = index.read_txn()?;
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let old_searchable_attributes = settings_delta.old_searchable_attributes().as_ref();
|
||||
let new_searchable_attributes = settings_delta.new_searchable_attributes().as_ref();
|
||||
old_searchable_attributes.zip(new_searchable_attributes).map(|(old, new)| {
|
||||
old.iter()
|
||||
// Ignore the field if it is not searchable anymore
|
||||
// or if it was never referenced in any document
|
||||
.filter_map(|name| if new.contains(name) { None } else { fields_ids_map.id(name) })
|
||||
.collect()
|
||||
})
|
||||
};
|
||||
|
||||
let fids_to_delete = match fids_to_delete {
|
||||
Some(fids) => fids,
|
||||
None => return Ok(()),
|
||||
};
|
||||
|
||||
progress.update_progress(SettingsIndexerStep::DeletingOldWordFidDocids);
|
||||
delete_old_word_fid_docids(wtxn, index, must_stop_processing, &fids_to_delete)?;
|
||||
|
||||
progress.update_progress(SettingsIndexerStep::DeletingOldFidWordCountDocids);
|
||||
delete_old_fid_word_count_docids(wtxn, index, must_stop_processing, fids_to_delete)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_old_word_fid_docids<MSP>(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
must_stop_processing: &MSP,
|
||||
fids_to_delete: &BTreeSet<u16>,
|
||||
) -> Result<(), Error>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
let mut iter = index.word_fid_docids.iter_mut(wtxn)?.remap_data_type::<DecodeIgnore>();
|
||||
while let Some(((_word, fid), ())) = iter.next().transpose()? {
|
||||
// TODO should I call it that often?
|
||||
if must_stop_processing() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
if fids_to_delete.contains(&fid) {
|
||||
// safety: We don't keep any references to the data.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_old_fid_word_count_docids<MSP>(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
must_stop_processing: &MSP,
|
||||
fids_to_delete: BTreeSet<u16>,
|
||||
) -> Result<(), Error>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
let db = index.field_id_word_count_docids.remap_data_type::<DecodeIgnore>();
|
||||
for fid_to_delete in fids_to_delete {
|
||||
if must_stop_processing() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
let mut iter = db.prefix_iter_mut(wtxn, &(fid_to_delete, 0))?;
|
||||
while let Some(((fid, _word_count), ())) = iter.next().transpose()? {
|
||||
debug_assert_eq!(fid, fid_to_delete);
|
||||
// safety: We don't keep any references to the data.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn indexer_memory_settings(
|
||||
current_num_threads: usize,
|
||||
grenad_parameters: GrenadParameters,
|
||||
|
||||
@@ -28,6 +28,8 @@ make_enum_progress! {
|
||||
ChangingVectorStore,
|
||||
UsingStableIndexer,
|
||||
UsingExperimentalIndexer,
|
||||
DeletingOldWordFidDocids,
|
||||
DeletingOldFidWordCountDocids,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1589,33 +1589,33 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
||||
// only use the new indexer when only the embedders, the searchable fields or the exact attributes possibly changed
|
||||
if let Self {
|
||||
searchable_fields: Setting::NotSet,
|
||||
searchable_fields: _,
|
||||
displayed_fields: Setting::NotSet,
|
||||
filterable_fields: Setting::NotSet,
|
||||
sortable_fields: Setting::NotSet,
|
||||
criteria: Setting::NotSet,
|
||||
stop_words: Setting::NotSet,
|
||||
non_separator_tokens: Setting::NotSet,
|
||||
separator_tokens: Setting::NotSet,
|
||||
dictionary: Setting::NotSet,
|
||||
stop_words: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
non_separator_tokens: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
separator_tokens: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
dictionary: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
distinct_field: Setting::NotSet,
|
||||
synonyms: Setting::NotSet,
|
||||
primary_key: Setting::NotSet,
|
||||
authorize_typos: Setting::NotSet,
|
||||
min_word_len_two_typos: Setting::NotSet,
|
||||
min_word_len_one_typo: Setting::NotSet,
|
||||
exact_words: Setting::NotSet,
|
||||
exact_attributes: Setting::NotSet,
|
||||
exact_words: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
exact_attributes: _,
|
||||
max_values_per_facet: Setting::NotSet,
|
||||
sort_facet_values_by: Setting::NotSet,
|
||||
pagination_max_total_hits: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
embedder_settings: _,
|
||||
search_cutoff: Setting::NotSet,
|
||||
localized_attributes_rules: Setting::NotSet,
|
||||
prefix_search: Setting::NotSet,
|
||||
localized_attributes_rules: Setting::NotSet, // TODO to start with
|
||||
prefix_search: Setting::NotSet, // TODO continue with this
|
||||
facet_search: Setting::NotSet,
|
||||
disable_on_numbers: Setting::NotSet,
|
||||
disable_on_numbers: Setting::NotSet, // TODO (require force reindexing of searchables)
|
||||
chat: Setting::NotSet,
|
||||
vector_store: Setting::NotSet,
|
||||
wtxn: _,
|
||||
@@ -1632,10 +1632,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
// Update index settings
|
||||
let embedding_config_updates = self.update_embedding_configs()?;
|
||||
self.update_user_defined_searchable_attributes()?;
|
||||
self.update_exact_attributes()?;
|
||||
|
||||
let mut new_inner_settings =
|
||||
InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
|
||||
new_inner_settings.recompute_searchables(self.wtxn, self.index)?;
|
||||
// Note that we don't need to update the searchables here,
|
||||
// as it will be done after the settings update.
|
||||
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
|
||||
|
||||
let primary_key_id = self
|
||||
.index
|
||||
@@ -2062,9 +2063,12 @@ impl InnerIndexSettings {
|
||||
let sortable_fields = index.sortable_fields(rtxn)?;
|
||||
let asc_desc_fields = index.asc_desc_fields(rtxn)?;
|
||||
let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string());
|
||||
let user_defined_searchable_attributes = index
|
||||
.user_defined_searchable_fields(rtxn)?
|
||||
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
|
||||
let user_defined_searchable_attributes = match index.user_defined_searchable_fields(rtxn)? {
|
||||
Some(fields) if fields.contains(&"*") => None,
|
||||
Some(fields) => Some(fields.into_iter().map(|f| f.to_string()).collect()),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let builder = MetadataBuilder::from_index(index, rtxn)?;
|
||||
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
|
||||
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
|
||||
@@ -2578,8 +2582,17 @@ fn deserialize_sub_embedder(
|
||||
/// Implement this trait for the settings delta type.
|
||||
/// This is used in the new settings update flow and will allow to easily replace the old settings delta type: `InnerIndexSettingsDiff`.
|
||||
pub trait SettingsDelta {
|
||||
fn new_embedders(&self) -> &RuntimeEmbedders;
|
||||
fn old_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
|
||||
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
|
||||
|
||||
fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
|
||||
fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
|
||||
|
||||
fn old_disabled_typos_terms(&self) -> &DisabledTyposTerms;
|
||||
fn new_disabled_typos_terms(&self) -> &DisabledTyposTerms;
|
||||
|
||||
fn old_embedders(&self) -> &RuntimeEmbedders;
|
||||
fn new_embedders(&self) -> &RuntimeEmbedders;
|
||||
fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
|
||||
fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
|
||||
fn try_for_each_fragment_diff<F, E>(
|
||||
@@ -2589,7 +2602,6 @@ pub trait SettingsDelta {
|
||||
) -> std::result::Result<(), E>
|
||||
where
|
||||
F: FnMut(FragmentDiff) -> std::result::Result<(), E>;
|
||||
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
|
||||
}
|
||||
|
||||
pub struct FragmentDiff<'a> {
|
||||
@@ -2598,26 +2610,40 @@ pub struct FragmentDiff<'a> {
|
||||
}
|
||||
|
||||
impl SettingsDelta for InnerIndexSettingsDiff {
|
||||
fn new_embedders(&self) -> &RuntimeEmbedders {
|
||||
&self.new.runtime_embedders
|
||||
fn old_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
|
||||
&self.old.fields_ids_map
|
||||
}
|
||||
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
|
||||
&self.new.fields_ids_map
|
||||
}
|
||||
|
||||
fn old_searchable_attributes(&self) -> &Option<Vec<String>> {
|
||||
&self.old.user_defined_searchable_attributes
|
||||
}
|
||||
fn new_searchable_attributes(&self) -> &Option<Vec<String>> {
|
||||
&self.new.user_defined_searchable_attributes
|
||||
}
|
||||
|
||||
fn old_disabled_typos_terms(&self) -> &DisabledTyposTerms {
|
||||
&self.old.disabled_typos_terms
|
||||
}
|
||||
fn new_disabled_typos_terms(&self) -> &DisabledTyposTerms {
|
||||
&self.new.disabled_typos_terms
|
||||
}
|
||||
|
||||
fn old_embedders(&self) -> &RuntimeEmbedders {
|
||||
&self.old.runtime_embedders
|
||||
}
|
||||
fn new_embedders(&self) -> &RuntimeEmbedders {
|
||||
&self.new.runtime_embedders
|
||||
}
|
||||
|
||||
fn new_embedder_category_id(&self) -> &HashMap<String, u8> {
|
||||
&self.new.embedder_category_id
|
||||
}
|
||||
|
||||
fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction> {
|
||||
&self.embedding_config_updates
|
||||
}
|
||||
|
||||
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
|
||||
&self.new.fields_ids_map
|
||||
}
|
||||
|
||||
fn try_for_each_fragment_diff<F, E>(
|
||||
&self,
|
||||
embedder_name: &str,
|
||||
|
||||
@@ -14,28 +14,21 @@ fn set_and_reset_searchable_fields() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
// First we send 3 documents with ids from 1 to 3.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "id": 1, "name": "kevin", "age": 23 },
|
||||
{ "id": 2, "name": "kevina", "age": 21},
|
||||
{ "id": 3, "name": "benoit", "age": 34 }
|
||||
]),
|
||||
)
|
||||
.add_documents(documents!([
|
||||
{ "id": 1, "name": "kevin", "age": 23 },
|
||||
{ "id": 2, "name": "kevina", "age": 21},
|
||||
{ "id": 3, "name": "benoit", "age": 34 }
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
// We change the searchable fields to be the "name" field only.
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec!["name".into()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, fields_ids_map, @r###"
|
||||
0 id |
|
||||
1 name |
|
||||
|
||||
@@ -112,13 +112,12 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
|
||||
rendered: I,
|
||||
unused_vectors_distribution: &C::ErrorMetadata,
|
||||
) -> Result<()> {
|
||||
if self.inputs.len() < self.inputs.capacity() {
|
||||
self.inputs.push(rendered);
|
||||
self.metadata.push(metadata);
|
||||
return Ok(());
|
||||
if self.inputs.len() >= self.inputs.capacity() {
|
||||
self.embed_chunks(unused_vectors_distribution)?;
|
||||
}
|
||||
|
||||
self.embed_chunks(unused_vectors_distribution)
|
||||
self.inputs.push(rendered);
|
||||
self.metadata.push(metadata);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn drain(mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<C> {
|
||||
|
||||
Reference in New Issue
Block a user