mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-29 09:15:38 +00:00
Compare commits
21 Commits
prototype-
...
prototype-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd01613a63 | ||
|
|
70d975b399 | ||
|
|
a8e6d946a7 | ||
|
|
7c1f72ae33 | ||
|
|
442a8f44c6 | ||
|
|
185a238c77 | ||
|
|
a82bf776f3 | ||
|
|
b2f86df127 | ||
|
|
c3a5f51705 | ||
|
|
686d1f4c12 | ||
|
|
ba75606731 | ||
|
|
baf3b036d9 | ||
|
|
0d499f0055 | ||
|
|
7999c397c5 | ||
|
|
c44db8b4bc | ||
|
|
9466949e34 | ||
|
|
f051bbfd84 | ||
|
|
72b1c3df08 | ||
|
|
01d2ee5cc1 | ||
|
|
e0c4682758 | ||
|
|
d9b4b39922 |
64
Cargo.lock
generated
64
Cargo.lock
generated
@@ -1207,6 +1207,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "doc-comment"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
|
||||
|
||||
[[package]]
|
||||
name = "dump"
|
||||
version = "1.2.0"
|
||||
@@ -1763,6 +1769,15 @@ dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
||||
dependencies = [
|
||||
"ahash 0.7.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
@@ -1864,6 +1879,22 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hnsw"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b9740ebf8769ec4ad6762cc951ba18f39bba6dfbc2fbbe46285f7539af79752"
|
||||
dependencies = [
|
||||
"ahash 0.7.6",
|
||||
"hashbrown 0.11.2",
|
||||
"libm",
|
||||
"num-traits",
|
||||
"rand_core",
|
||||
"serde",
|
||||
"smallvec",
|
||||
"space",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.9"
|
||||
@@ -1994,7 +2025,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown",
|
||||
"hashbrown 0.12.3",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -2086,7 +2117,7 @@ checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1"
|
||||
dependencies = [
|
||||
"cedarwood",
|
||||
"fxhash",
|
||||
"hashbrown",
|
||||
"hashbrown 0.12.3",
|
||||
"lazy_static",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
@@ -2715,6 +2746,7 @@ dependencies = [
|
||||
"bimap",
|
||||
"bincode",
|
||||
"bstr",
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"charabia",
|
||||
"concat-arrays",
|
||||
@@ -2730,6 +2762,7 @@ dependencies = [
|
||||
"geoutils",
|
||||
"grenad",
|
||||
"heed",
|
||||
"hnsw",
|
||||
"insta",
|
||||
"itertools",
|
||||
"json-depth-checker",
|
||||
@@ -2744,6 +2777,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"ordered-float",
|
||||
"rand",
|
||||
"rand_pcg",
|
||||
"rayon",
|
||||
"roaring",
|
||||
"rstar",
|
||||
@@ -2753,6 +2787,7 @@ dependencies = [
|
||||
"smallstr",
|
||||
"smallvec",
|
||||
"smartstring",
|
||||
"space",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"time",
|
||||
@@ -3327,6 +3362,16 @@ dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_pcg"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -3764,6 +3809,9 @@ name = "smallvec"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smartstring"
|
||||
@@ -3786,6 +3834,16 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "space"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5ab9701ae895386d13db622abf411989deff7109b13b46b6173bb4ce5c1d123"
|
||||
dependencies = [
|
||||
"doc-comment",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spin"
|
||||
version = "0.5.2"
|
||||
@@ -4433,7 +4491,7 @@ version = "0.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
|
||||
dependencies = [
|
||||
"hashbrown",
|
||||
"hashbrown 0.12.3",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
|
||||
@@ -217,6 +217,7 @@ InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
|
||||
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
|
||||
@@ -224,7 +225,6 @@ InvalidIndexLimit , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexOffset , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexUid , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
|
||||
@@ -331,11 +331,9 @@ impl ErrorCode for milli::Error {
|
||||
UserError::SortRankingRuleMissing => Code::InvalidSearchSort,
|
||||
UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets,
|
||||
UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,
|
||||
UserError::InvalidSearchableAttribute { .. } => {
|
||||
Code::InvalidAttributesToSearchOn
|
||||
}
|
||||
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
|
||||
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
|
||||
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
|
||||
UserError::SortError(_) => Code::InvalidSearchSort,
|
||||
UserError::InvalidMinTypoWordLenSetting(_, _) => {
|
||||
Code::InvalidSettingsTypoTolerance
|
||||
|
||||
@@ -34,6 +34,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
pub struct SearchQueryGet {
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
|
||||
q: Option<String>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
|
||||
vector: Option<Vec<f32>>,
|
||||
#[deserr(default = Param(DEFAULT_SEARCH_OFFSET()), error = DeserrQueryParamError<InvalidSearchOffset>)]
|
||||
offset: Param<usize>,
|
||||
#[deserr(default = Param(DEFAULT_SEARCH_LIMIT()), error = DeserrQueryParamError<InvalidSearchLimit>)]
|
||||
@@ -66,8 +68,6 @@ pub struct SearchQueryGet {
|
||||
crop_marker: String,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchMatchingStrategy>)]
|
||||
matching_strategy: MatchingStrategy,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidAttributesToSearchOn>)]
|
||||
pub attributes_to_search_on: Option<CS<String>>,
|
||||
}
|
||||
|
||||
impl From<SearchQueryGet> for SearchQuery {
|
||||
@@ -82,6 +82,7 @@ impl From<SearchQueryGet> for SearchQuery {
|
||||
|
||||
Self {
|
||||
q: other.q,
|
||||
vector: other.vector,
|
||||
offset: other.offset.0,
|
||||
limit: other.limit.0,
|
||||
page: other.page.as_deref().copied(),
|
||||
@@ -98,7 +99,6 @@ impl From<SearchQueryGet> for SearchQuery {
|
||||
highlight_post_tag: other.highlight_post_tag,
|
||||
crop_marker: other.crop_marker,
|
||||
matching_strategy: other.matching_strategy,
|
||||
attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,11 +31,13 @@ pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
|
||||
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
|
||||
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, Deserr)]
|
||||
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct SearchQuery {
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
pub q: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
pub vector: Option<Vec<f32>>,
|
||||
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
|
||||
pub offset: usize,
|
||||
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
|
||||
@@ -68,8 +70,6 @@ pub struct SearchQuery {
|
||||
pub crop_marker: String,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchMatchingStrategy>, default)]
|
||||
pub matching_strategy: MatchingStrategy,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidAttributesToSearchOn>, default)]
|
||||
pub attributes_to_search_on: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
impl SearchQuery {
|
||||
@@ -82,13 +82,15 @@ impl SearchQuery {
|
||||
// This struct contains the fields of `SearchQuery` inline.
|
||||
// This is because neither deserr nor serde support `flatten` when using `deny_unknown_fields.
|
||||
// The `From<SearchQueryWithIndex>` implementation ensures both structs remain up to date.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserr)]
|
||||
#[derive(Debug, Clone, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct SearchQueryWithIndex {
|
||||
#[deserr(error = DeserrJsonError<InvalidIndexUid>, missing_field_error = DeserrJsonError::missing_index_uid)]
|
||||
pub index_uid: IndexUid,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
pub q: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
pub vector: Option<Vec<f32>>,
|
||||
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
|
||||
pub offset: usize,
|
||||
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
|
||||
@@ -121,8 +123,6 @@ pub struct SearchQueryWithIndex {
|
||||
pub crop_marker: String,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchMatchingStrategy>, default)]
|
||||
pub matching_strategy: MatchingStrategy,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidAttributesToSearchOn>, default)]
|
||||
pub attributes_to_search_on: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
impl SearchQueryWithIndex {
|
||||
@@ -130,6 +130,7 @@ impl SearchQueryWithIndex {
|
||||
let SearchQueryWithIndex {
|
||||
index_uid,
|
||||
q,
|
||||
vector,
|
||||
offset,
|
||||
limit,
|
||||
page,
|
||||
@@ -146,12 +147,12 @@ impl SearchQueryWithIndex {
|
||||
highlight_post_tag,
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
} = self;
|
||||
(
|
||||
index_uid,
|
||||
SearchQuery {
|
||||
q,
|
||||
vector,
|
||||
offset,
|
||||
limit,
|
||||
page,
|
||||
@@ -168,7 +169,6 @@ impl SearchQueryWithIndex {
|
||||
highlight_post_tag,
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
// do not use ..Default::default() here,
|
||||
// rather add any missing field from `SearchQuery` to `SearchQueryWithIndex`
|
||||
},
|
||||
@@ -276,12 +276,12 @@ pub fn perform_search(
|
||||
|
||||
let mut search = index.search(&rtxn);
|
||||
|
||||
if let Some(ref query) = query.q {
|
||||
search.query(query);
|
||||
if let Some(ref vector) = query.vector {
|
||||
search.vector(vector.clone());
|
||||
}
|
||||
|
||||
if let Some(ref searchable) = query.attributes_to_search_on {
|
||||
search.searchable_attributes(searchable);
|
||||
if let Some(ref query) = query.q {
|
||||
search.query(query);
|
||||
}
|
||||
|
||||
let is_finite_pagination = query.is_finite_pagination();
|
||||
|
||||
@@ -963,27 +963,3 @@ async fn sort_unset_ranking_rule() {
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn search_on_unknown_field() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}),
|
||||
|response, code| {
|
||||
assert_eq!(400, code, "{}", response);
|
||||
assert_eq!(response, json!({
|
||||
"message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.",
|
||||
"code": "invalid_attributes_to_search_on",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_attributes_to_search_on"
|
||||
}));
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ mod errors;
|
||||
mod formatted;
|
||||
mod multi;
|
||||
mod pagination;
|
||||
mod restrict_searchable;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
@@ -1,241 +0,0 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::index::Index;
|
||||
use crate::common::Server;
|
||||
|
||||
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
|
||||
let index = server.index("test");
|
||||
|
||||
index.add_documents(documents.clone(), None).await;
|
||||
index.wait_task(0).await;
|
||||
index
|
||||
}
|
||||
|
||||
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Planet",
|
||||
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||
"id": "2",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "3",
|
||||
}])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search_on_title() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
// simple search should return 2 documents (ids: 2 and 3).
|
||||
index
|
||||
.search(
|
||||
json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"]}),
|
||||
|response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(response["hits"].as_array().unwrap().len(), 2);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_prefix_search_on_title() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
// simple search should return 2 documents (ids: 2 and 3).
|
||||
index
|
||||
.search(json!({"q": "Captain Mar", "attributesToSearchOn": ["title"]}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(response["hits"].as_array().unwrap().len(), 2);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search_on_title_matching_strategy_all() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
// simple search matching strategy all should only return 1 document (ids: 2).
|
||||
index
|
||||
.search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "matchingStrategy": "all"}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search_on_no_field() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
// simple search on no field shouldn't return any document.
|
||||
index
|
||||
.search(json!({"q": "Captain Marvel", "attributesToSearchOn": []}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(response["hits"].as_array().unwrap().len(), 0);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn word_ranking_rule_order() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
// Document 3 should appear before document 2.
|
||||
index
|
||||
.search(
|
||||
json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}),
|
||||
|response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(
|
||||
response["hits"],
|
||||
json!([
|
||||
{"id": "3"},
|
||||
{"id": "2"},
|
||||
])
|
||||
);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn word_ranking_rule_order_exact_words() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
index.update_settings_typo_tolerance(json!({"disableOnWords": ["Captain", "Marvel"]})).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
// simple search should return 2 documents (ids: 2 and 3).
|
||||
index
|
||||
.search(
|
||||
json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}),
|
||||
|response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(
|
||||
response["hits"],
|
||||
json!([
|
||||
{"id": "3"},
|
||||
{"id": "2"},
|
||||
])
|
||||
);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn typo_ranking_rule_order() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(
|
||||
&server,
|
||||
&json!([
|
||||
{
|
||||
"title": "Capitain Marivel",
|
||||
"desc": "Captain Marvel",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marivel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "2",
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Document 2 should appear before document 1.
|
||||
index
|
||||
.search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(
|
||||
response["hits"],
|
||||
json!([
|
||||
{"id": "2"},
|
||||
{"id": "1"},
|
||||
])
|
||||
);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn attributes_ranking_rule_order() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(
|
||||
&server,
|
||||
&json!([
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"footer": "The story of Captain Marvel",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "The Avengers",
|
||||
"desc": "Captain Marvel is far from the earth",
|
||||
"footer": "A super hero team",
|
||||
"id": "2",
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Document 2 should appear before document 1.
|
||||
index
|
||||
.search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["desc", "footer"], "attributesToRetrieve": ["id"]}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(
|
||||
response["hits"],
|
||||
json!([
|
||||
{"id": "2"},
|
||||
{"id": "1"},
|
||||
])
|
||||
);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn exactness_ranking_rule_order() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(
|
||||
&server,
|
||||
&json!([
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "Captain Marivel",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "CaptainMarvel",
|
||||
"id": "2",
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Document 2 should appear before document 1.
|
||||
index
|
||||
.search(json!({"q": "Captain Marvel", "attributesToRetrieve": ["id"], "attributesToSearchOn": ["desc"]}), |response, code| {
|
||||
assert_eq!(200, code, "{}", response);
|
||||
assert_eq!(
|
||||
response["hits"],
|
||||
json!([
|
||||
{"id": "2"},
|
||||
{"id": "1"},
|
||||
])
|
||||
);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
@@ -15,6 +15,7 @@ license.workspace = true
|
||||
bimap = { version = "0.6.3", features = ["serde"] }
|
||||
bincode = "1.3.3"
|
||||
bstr = "1.4.0"
|
||||
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.4.3"
|
||||
charabia = { version = "0.7.2", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
@@ -32,18 +33,21 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
|
||||
"lmdb",
|
||||
"sync-read-txn",
|
||||
] }
|
||||
hnsw = { version = "0.11.0", features = ["serde1"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
memmap2 = "0.5.10"
|
||||
obkv = "0.2.0"
|
||||
once_cell = "1.17.1"
|
||||
ordered-float = "3.6.0"
|
||||
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
||||
rayon = "1.7.0"
|
||||
roaring = "0.10.1"
|
||||
rstar = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
slice-group-by = "0.3.0"
|
||||
space = "0.17.0"
|
||||
smallstr = { version = "0.3.0", features = ["serde"] }
|
||||
smallvec = "1.10.0"
|
||||
smartstring = "1.0.1"
|
||||
|
||||
@@ -52,6 +52,7 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
let docs = execute_search(
|
||||
&mut ctx,
|
||||
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
|
||||
&None,
|
||||
TermsMatchingStrategy::Last,
|
||||
false,
|
||||
&None,
|
||||
|
||||
34
milli/src/distance.rs
Normal file
34
milli/src/distance.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use space::Metric;
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct DotProduct;
|
||||
|
||||
impl Metric<Vec<f32>> for DotProduct {
|
||||
type Unit = u32;
|
||||
|
||||
// TODO explain me this function, I don't understand why f32.to_bits is ordered.
|
||||
// I tried to do this and it wasn't OK <https://stackoverflow.com/a/43305015/1941280>
|
||||
//
|
||||
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
|
||||
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
|
||||
let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
|
||||
let dist = 1.0 - dist;
|
||||
debug_assert!(!dist.is_nan());
|
||||
dist.to_bits()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct Euclidean;
|
||||
|
||||
impl Metric<Vec<f32>> for Euclidean {
|
||||
type Unit = u32;
|
||||
|
||||
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
|
||||
let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
|
||||
let dist = squared.sqrt();
|
||||
debug_assert!(!dist.is_nan());
|
||||
dist.to_bits()
|
||||
}
|
||||
}
|
||||
@@ -110,9 +110,11 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
},
|
||||
#[error(transparent)]
|
||||
InvalidGeoField(#[from] GeoError),
|
||||
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
|
||||
InvalidVectorDimensions { expected: usize, found: usize },
|
||||
#[error("{0}")]
|
||||
InvalidFilter(String),
|
||||
#[error("Invalid type for filter subexpression: `expected {}, found: {1}`.", .0.join(", "))]
|
||||
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
|
||||
InvalidFilterExpression(&'static [&'static str], Value),
|
||||
#[error("Attribute `{}` is not sortable. {}",
|
||||
.field,
|
||||
@@ -124,16 +126,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
}
|
||||
)]
|
||||
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
|
||||
#[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.",
|
||||
.field,
|
||||
.valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
|
||||
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
|
||||
)]
|
||||
InvalidSearchableAttribute {
|
||||
field: String,
|
||||
valid_fields: BTreeSet<String>,
|
||||
hidden_fields: bool,
|
||||
},
|
||||
#[error("{}", HeedError::BadOpenOptions)]
|
||||
InvalidLmdbOpenOptions,
|
||||
#[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
|
||||
|
||||
@@ -23,9 +23,3 @@ pub use self::roaring_bitmap_length::{
|
||||
pub use self::script_language_codec::ScriptLanguageCodec;
|
||||
pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
|
||||
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
pub trait BytesDecodeOwned {
|
||||
type DItem;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem>;
|
||||
}
|
||||
|
||||
@@ -2,11 +2,8 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct BoRoaringBitmapCodec;
|
||||
|
||||
impl BoRoaringBitmapCodec {
|
||||
@@ -16,7 +13,7 @@ impl BoRoaringBitmapCodec {
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecode<'_> for BoRoaringBitmapCodec {
|
||||
impl heed::BytesDecode<'_> for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
@@ -31,14 +28,6 @@ impl BytesDecode<'_> for BoRoaringBitmapCodec {
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
|
||||
@@ -5,8 +5,6 @@ use std::mem::size_of;
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
/// This is the limit where using a byteorder became less size efficient
|
||||
/// than using a direct roaring encoding, it is also the point where we are able
|
||||
/// to determine the encoding used only by using the array of bytes length.
|
||||
@@ -105,14 +103,6 @@ impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for CboRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::deserialize_from(bytes).ok()
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
|
||||
@@ -2,8 +2,6 @@ use std::borrow::Cow;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct RoaringBitmapCodec;
|
||||
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapCodec {
|
||||
@@ -14,14 +12,6 @@ impl heed::BytesDecode<'_> for RoaringBitmapCodec {
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmap::deserialize_from(bytes).ok()
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
|
||||
@@ -1,23 +1,11 @@
|
||||
use std::mem;
|
||||
|
||||
use heed::BytesDecode;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct BoRoaringBitmapLenCodec;
|
||||
|
||||
impl BytesDecode<'_> for BoRoaringBitmapLenCodec {
|
||||
impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Some((bytes.len() / mem::size_of::<u32>()) as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
use std::mem;
|
||||
|
||||
use heed::BytesDecode;
|
||||
|
||||
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct CboRoaringBitmapLenCodec;
|
||||
|
||||
impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
@@ -23,11 +20,3 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,8 +3,6 @@ use std::mem;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
|
||||
const SERIAL_COOKIE: u16 = 12347;
|
||||
|
||||
@@ -61,14 +59,6 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::BytesEncode;
|
||||
|
||||
@@ -8,10 +8,12 @@ use charabia::{Language, Script};
|
||||
use heed::flags::Flags;
|
||||
use heed::types::*;
|
||||
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
|
||||
use rand_pcg::Pcg32;
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::distance::DotProduct;
|
||||
use crate::error::{InternalError, UserError};
|
||||
use crate::facet::FacetType;
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
@@ -26,6 +28,9 @@ use crate::{
|
||||
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
|
||||
};
|
||||
|
||||
/// The HNSW data-structure that we serialize, fill and search in.
|
||||
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
|
||||
|
||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
|
||||
|
||||
@@ -42,6 +47,7 @@ pub mod main_key {
|
||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
|
||||
pub const GEO_RTREE_KEY: &str = "geo-rtree";
|
||||
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
|
||||
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
||||
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
|
||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||
@@ -86,6 +92,7 @@ pub mod db_name {
|
||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
|
||||
pub const DOCUMENTS: &str = "documents";
|
||||
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
|
||||
}
|
||||
@@ -149,6 +156,9 @@ pub struct Index {
|
||||
/// Maps the document id, the facet field id and the strings.
|
||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
||||
|
||||
/// Maps a vector id to the document id that have it.
|
||||
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
|
||||
|
||||
/// Maps the document id to the document as an obkv store.
|
||||
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||
}
|
||||
@@ -162,7 +172,7 @@ impl Index {
|
||||
) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(23);
|
||||
options.max_dbs(24);
|
||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||
|
||||
let env = options.open(path)?;
|
||||
@@ -198,11 +208,11 @@ impl Index {
|
||||
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
|
||||
let facet_id_is_empty_docids =
|
||||
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
|
||||
|
||||
let field_id_docid_facet_f64s =
|
||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
|
||||
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
||||
wtxn.commit()?;
|
||||
|
||||
@@ -231,6 +241,7 @@ impl Index {
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_id_docid,
|
||||
documents,
|
||||
})
|
||||
}
|
||||
@@ -502,6 +513,26 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/* vector HNSW */
|
||||
|
||||
/// Writes the provided `hnsw`.
|
||||
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
|
||||
}
|
||||
|
||||
/// Delete the `hnsw`.
|
||||
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
|
||||
}
|
||||
|
||||
/// Returns the `hnsw`.
|
||||
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
|
||||
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
|
||||
Some(hnsw) => Ok(Some(hnsw)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/* field distribution */
|
||||
|
||||
/// Writes the field distribution which associates every field name with
|
||||
@@ -1466,9 +1497,9 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
"###
|
||||
);
|
||||
|
||||
@@ -1486,9 +1517,9 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
"###
|
||||
);
|
||||
|
||||
@@ -1502,9 +1533,9 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
has_dog 1
|
||||
id 2
|
||||
name 2
|
||||
has_dog 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
"###
|
||||
);
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ pub mod documents;
|
||||
|
||||
mod asc_desc;
|
||||
mod criterion;
|
||||
pub mod distance;
|
||||
mod error;
|
||||
mod external_documents_ids;
|
||||
pub mod facet;
|
||||
|
||||
@@ -22,12 +22,12 @@ pub mod new;
|
||||
|
||||
pub struct Search<'a> {
|
||||
query: Option<String>,
|
||||
vector: Option<Vec<f32>>,
|
||||
// this should be linked to the String in the query
|
||||
filter: Option<Filter<'a>>,
|
||||
offset: usize,
|
||||
limit: usize,
|
||||
sort_criteria: Option<Vec<AscDesc>>,
|
||||
searchable_attributes: Option<&'a [String]>,
|
||||
geo_strategy: new::GeoSortStrategy,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
words_limit: usize,
|
||||
@@ -40,11 +40,11 @@ impl<'a> Search<'a> {
|
||||
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
|
||||
Search {
|
||||
query: None,
|
||||
vector: None,
|
||||
filter: None,
|
||||
offset: 0,
|
||||
limit: 20,
|
||||
sort_criteria: None,
|
||||
searchable_attributes: None,
|
||||
geo_strategy: new::GeoSortStrategy::default(),
|
||||
terms_matching_strategy: TermsMatchingStrategy::default(),
|
||||
exhaustive_number_hits: false,
|
||||
@@ -59,6 +59,11 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn vector(&mut self, vector: impl Into<Vec<f32>>) -> &mut Search<'a> {
|
||||
self.vector = Some(vector.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
|
||||
self.offset = offset;
|
||||
self
|
||||
@@ -74,11 +79,6 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
|
||||
self.searchable_attributes = Some(searchable);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
|
||||
self.terms_matching_strategy = value;
|
||||
self
|
||||
@@ -109,15 +109,11 @@ impl<'a> Search<'a> {
|
||||
|
||||
pub fn execute(&self) -> Result<SearchResult> {
|
||||
let mut ctx = SearchContext::new(self.index, self.rtxn);
|
||||
|
||||
if let Some(searchable_attributes) = self.searchable_attributes {
|
||||
ctx.searchable_attributes(searchable_attributes)?;
|
||||
}
|
||||
|
||||
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
&self.query,
|
||||
&self.vector,
|
||||
self.terms_matching_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
&self.filter,
|
||||
@@ -144,11 +140,11 @@ impl fmt::Debug for Search<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let Search {
|
||||
query,
|
||||
vector: _,
|
||||
filter,
|
||||
offset,
|
||||
limit,
|
||||
sort_criteria,
|
||||
searchable_attributes,
|
||||
geo_strategy: _,
|
||||
terms_matching_strategy,
|
||||
words_limit,
|
||||
@@ -158,11 +154,11 @@ impl fmt::Debug for Search<'_> {
|
||||
} = self;
|
||||
f.debug_struct("Search")
|
||||
.field("query", query)
|
||||
.field("vector", &"[...]")
|
||||
.field("filter", filter)
|
||||
.field("offset", offset)
|
||||
.field("limit", limit)
|
||||
.field("sort_criteria", sort_criteria)
|
||||
.field("searchable_attributes", searchable_attributes)
|
||||
.field("terms_matching_strategy", terms_matching_strategy)
|
||||
.field("exhaustive_number_hits", exhaustive_number_hits)
|
||||
.field("words_limit", words_limit)
|
||||
|
||||
@@ -4,13 +4,12 @@ use std::hash::Hash;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesEncode, Database, RoTxn};
|
||||
use heed::{BytesDecode, BytesEncode, Database, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::Interned;
|
||||
use super::Word;
|
||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
||||
};
|
||||
@@ -23,110 +22,50 @@ use crate::{
|
||||
#[derive(Default)]
|
||||
pub struct DatabaseCache<'ctx> {
|
||||
pub word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub prefix_word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
|
||||
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
|
||||
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
}
|
||||
impl<'ctx> DatabaseCache<'ctx> {
|
||||
fn get_value<'v, K1, KC, DC>(
|
||||
fn get_value<'v, K1, KC>(
|
||||
txn: &'ctx RoTxn,
|
||||
cache_key: K1,
|
||||
db_key: &'v KC::EItem,
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
cache: &mut FxHashMap<K1, Option<&'ctx [u8]>>,
|
||||
db: Database<KC, ByteSlice>,
|
||||
) -> Result<Option<DC::DItem>>
|
||||
) -> Result<Option<&'ctx [u8]>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
DC: BytesDecodeOwned,
|
||||
{
|
||||
match cache.entry(cache_key) {
|
||||
Entry::Occupied(_) => {}
|
||||
let bitmap_ptr = match cache.entry(cache_key) {
|
||||
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
|
||||
entry.insert(bitmap_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
match cache.get(&cache_key).unwrap() {
|
||||
Some(Cow::Borrowed(bytes)) => {
|
||||
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
|
||||
}
|
||||
Some(Cow::Owned(bytes)) => {
|
||||
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_value_from_keys<'v, K1, KC, DC>(
|
||||
txn: &'ctx RoTxn,
|
||||
cache_key: K1,
|
||||
db_keys: &'v [KC::EItem],
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
db: Database<KC, ByteSlice>,
|
||||
merger: MergeFn,
|
||||
) -> Result<Option<DC::DItem>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
DC: BytesDecodeOwned,
|
||||
KC::EItem: Sized,
|
||||
{
|
||||
match cache.entry(cache_key) {
|
||||
Entry::Occupied(_) => {}
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
|
||||
[] => None,
|
||||
[key] => db.get(txn, key)?.map(Cow::Borrowed),
|
||||
keys => {
|
||||
let bitmaps = keys
|
||||
.iter()
|
||||
.filter_map(|key| db.get(txn, key).transpose())
|
||||
.map(|v| v.map(Cow::Borrowed))
|
||||
.collect::<std::result::Result<Vec<Cow<[u8]>>, _>>()?;
|
||||
|
||||
if bitmaps.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(merger(&[], &bitmaps[..])?)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let bitmap_ptr = db.get(txn, db_key)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
bitmap_ptr
|
||||
}
|
||||
};
|
||||
|
||||
match cache.get(&cache_key).unwrap() {
|
||||
Some(Cow::Borrowed(bytes)) => {
|
||||
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
|
||||
}
|
||||
Some(Cow::Owned(bytes)) => {
|
||||
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
|
||||
if let Some(fst) = self.db_cache.words_fst.clone() {
|
||||
@@ -160,41 +99,30 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
|
||||
/// Retrieve or insert the given value in the `word_docids` database.
|
||||
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(word).as_str();
|
||||
let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
||||
),
|
||||
}
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn get_db_exact_word_docids(
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.exact_word_docids,
|
||||
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
|
||||
@@ -222,41 +150,30 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(prefix).as_str();
|
||||
let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
),
|
||||
}
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn get_db_exact_word_prefix_docids(
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.exact_word_prefix_docids,
|
||||
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids(
|
||||
@@ -265,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
@@ -275,7 +192,9 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids_len(
|
||||
@@ -284,7 +203,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<u64>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
@@ -294,7 +213,11 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| {
|
||||
CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())
|
||||
})
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_pair_proximity_docids(
|
||||
@@ -303,7 +226,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
prefix2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, prefix2),
|
||||
&(
|
||||
@@ -313,7 +236,9 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
),
|
||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
@@ -321,7 +246,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
right: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, left_prefix, right),
|
||||
&(
|
||||
@@ -331,7 +256,9 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
),
|
||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_fid_docids(
|
||||
@@ -339,18 +266,15 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
// if the requested fid isn't in the restricted list, return None.
|
||||
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, fid),
|
||||
&(self.word_interner.get(word).as_str(), fid),
|
||||
&mut self.db_cache.word_fid_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_fid_docids(
|
||||
@@ -358,18 +282,15 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word_prefix: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
// if the requested fid isn't in the restricted list, return None.
|
||||
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, fid),
|
||||
&(self.word_interner.get(word_prefix).as_str(), fid),
|
||||
&mut self.db_cache.word_prefix_fid_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
@@ -388,7 +309,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value)));
|
||||
self.db_cache.word_fid_docids.insert((word, fid), Some(value));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
@@ -414,9 +335,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_prefix_fid_docids
|
||||
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
|
||||
self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
@@ -431,13 +350,15 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, position),
|
||||
&(self.word_interner.get(word).as_str(), position),
|
||||
&mut self.db_cache.word_position_docids,
|
||||
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_position_docids(
|
||||
@@ -445,13 +366,15 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word_prefix: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, position),
|
||||
&(self.word_interner.get(word_prefix).as_str(), position),
|
||||
&mut self.db_cache.word_prefix_position_docids,
|
||||
self.index.word_prefix_position_docids.remap_data_type::<ByteSlice>(),
|
||||
)
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
@@ -470,9 +393,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
for result in remap_key_type {
|
||||
let ((_, position), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_position_docids
|
||||
.insert((word, position), Some(Cow::Borrowed(value)));
|
||||
self.db_cache.word_position_docids.insert((word, position), Some(value));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
@@ -503,7 +424,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_prefix_position_docids
|
||||
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
|
||||
.insert((word_prefix, position), Some(value));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
|
||||
@@ -509,6 +509,7 @@ mod tests {
|
||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||
&mut ctx,
|
||||
&Some(query.to_string()),
|
||||
&None,
|
||||
crate::TermsMatchingStrategy::default(),
|
||||
false,
|
||||
&None,
|
||||
|
||||
@@ -20,7 +20,7 @@ mod sort;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::collections::HashSet;
|
||||
|
||||
use bucket_sort::{bucket_sort, BucketSortOutput};
|
||||
use charabia::TokenizerBuilder;
|
||||
@@ -28,6 +28,7 @@ use db_cache::DatabaseCache;
|
||||
use exact_attribute::ExactAttribute;
|
||||
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
|
||||
use heed::RoTxn;
|
||||
use hnsw::Searcher;
|
||||
use interner::{DedupInterner, Interner};
|
||||
pub use logger::visual::VisualSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
@@ -39,14 +40,16 @@ use ranking_rules::{
|
||||
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
|
||||
use roaring::RoaringBitmap;
|
||||
use sort::Sort;
|
||||
use space::Neighbor;
|
||||
|
||||
use self::geo_sort::GeoSort;
|
||||
pub use self::geo_sort::Strategy as GeoSortStrategy;
|
||||
use self::graph_based_ranking_rule::Words;
|
||||
use self::interner::Interned;
|
||||
use crate::error::FieldIdMapMissingEntry;
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
|
||||
use crate::{
|
||||
AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
|
||||
};
|
||||
|
||||
/// A structure used throughout the execution of a search query.
|
||||
pub struct SearchContext<'ctx> {
|
||||
@@ -57,7 +60,6 @@ pub struct SearchContext<'ctx> {
|
||||
pub phrase_interner: DedupInterner<Phrase>,
|
||||
pub term_interner: Interner<QueryTerm>,
|
||||
pub phrase_docids: PhraseDocIdsCache,
|
||||
pub restricted_fids: Option<Vec<u16>>,
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
@@ -70,66 +72,8 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
phrase_interner: <_>::default(),
|
||||
term_interner: <_>::default(),
|
||||
phrase_docids: <_>::default(),
|
||||
restricted_fids: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> {
|
||||
let fids_map = self.index.fields_ids_map(self.txn)?;
|
||||
let searchable_names = self.index.searchable_fields(self.txn)?;
|
||||
|
||||
let mut restricted_fids = Vec::new();
|
||||
for field_name in searchable_attributes {
|
||||
let searchable_contains_name =
|
||||
searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name));
|
||||
let fid = match (fids_map.id(field_name), searchable_contains_name) {
|
||||
// The Field id exist and the field is searchable
|
||||
(Some(fid), Some(true)) | (Some(fid), None) => fid,
|
||||
// The field is searchable but the Field id doesn't exist => Internal Error
|
||||
(None, Some(true)) => {
|
||||
return Err(FieldIdMapMissingEntry::FieldName {
|
||||
field_name: field_name.to_string(),
|
||||
process: "search",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
// The field is not searchable => User error
|
||||
_otherwise => {
|
||||
let mut valid_fields: BTreeSet<_> =
|
||||
fids_map.names().map(String::from).collect();
|
||||
|
||||
// Filter by the searchable names
|
||||
if let Some(sn) = searchable_names {
|
||||
let searchable_names = sn.iter().map(|s| s.to_string()).collect();
|
||||
valid_fields = &valid_fields & &searchable_names;
|
||||
}
|
||||
|
||||
let searchable_count = valid_fields.len();
|
||||
|
||||
// Remove hidden fields
|
||||
if let Some(dn) = self.index.displayed_fields(self.txn)? {
|
||||
let displayable_names = dn.iter().map(|s| s.to_string()).collect();
|
||||
valid_fields = &valid_fields & &displayable_names;
|
||||
}
|
||||
|
||||
let hidden_fields = searchable_count > valid_fields.len();
|
||||
let field = field_name.to_string();
|
||||
return Err(UserError::InvalidSearchableAttribute {
|
||||
field,
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
};
|
||||
|
||||
restricted_fids.push(fid);
|
||||
}
|
||||
|
||||
self.restricted_fids = Some(restricted_fids);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
|
||||
@@ -409,6 +353,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext,
|
||||
query: &Option<String>,
|
||||
vector: &Option<Vec<f32>>,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
exhaustive_number_hits: bool,
|
||||
filters: &Option<Filter>,
|
||||
@@ -488,6 +433,33 @@ pub fn execute_search(
|
||||
|
||||
let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output;
|
||||
|
||||
let docids = match vector {
|
||||
Some(vector) => {
|
||||
// return the nearest documents that are also part of the candidates.
|
||||
let mut searcher = Searcher::new();
|
||||
let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
|
||||
let ef = hnsw.len().min(100);
|
||||
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
|
||||
let neighbors = hnsw.nearest(vector, ef, &mut searcher, &mut dest[..]);
|
||||
|
||||
let mut docids = Vec::new();
|
||||
for Neighbor { index, distance: _ } in neighbors.iter() {
|
||||
let index = BEU32::new(*index as u32);
|
||||
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
|
||||
if universe.contains(docid) {
|
||||
docids.push(docid);
|
||||
if docids.len() == (from + length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docids.into_iter().skip(from).take(length).collect()
|
||||
}
|
||||
// return the search docids if the vector field is not specified
|
||||
None => docids,
|
||||
};
|
||||
|
||||
// The candidates is the universe unless the exhaustive number of hits
|
||||
// is requested and a distinct attribute is set.
|
||||
if exhaustive_number_hits {
|
||||
|
||||
@@ -318,7 +318,7 @@ pub fn snap_field_distributions(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut snap = String::new();
|
||||
for (field, count) in index.field_distribution(&rtxn).unwrap() {
|
||||
writeln!(&mut snap, "{field:<16} {count:<6}").unwrap();
|
||||
writeln!(&mut snap, "{field:<16} {count:<6} |").unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
@@ -328,7 +328,7 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let name = fields_ids_map.name(field_id).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {name:<16} |").unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
source: milli/src/index.rs
|
||||
---
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
source: milli/src/index.rs
|
||||
---
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
age 1 |
|
||||
id 2 |
|
||||
name 2 |
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_id_docid,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
@@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||
|
||||
// We clean all the faceted documents ids.
|
||||
for field_id in faceted_fields {
|
||||
@@ -95,6 +97,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
vector_id_docid.clear(self.wtxn)?;
|
||||
documents.clear(self.wtxn)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
|
||||
@@ -4,8 +4,10 @@ use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
use fst::IntoStreamer;
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
|
||||
use heed::{BytesDecode, BytesEncode, Database, RwIter};
|
||||
use hnsw::Searcher;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use space::KnnPoints;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::facet::delete::FacetsDelete;
|
||||
@@ -14,6 +16,7 @@ use crate::error::InternalError;
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::index::Hnsw;
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
|
||||
};
|
||||
@@ -247,6 +250,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
vector_id_docid,
|
||||
documents,
|
||||
} = self.index;
|
||||
// Remove from the documents database
|
||||
@@ -436,6 +440,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
// An ugly and slow way to remove the vectors from the HNSW
|
||||
// It basically reconstructs the HNSW from scratch without editing the current one.
|
||||
let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
|
||||
if !current_hnsw.is_empty() {
|
||||
let mut new_hnsw = Hnsw::default();
|
||||
let mut searcher = Searcher::new();
|
||||
let mut new_vector_id_docids = Vec::new();
|
||||
|
||||
for result in vector_id_docid.iter(self.wtxn)? {
|
||||
let (vector_id, docid) = result?;
|
||||
if !self.to_delete_docids.contains(docid.get()) {
|
||||
let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
|
||||
let vector_id = new_hnsw.insert(vector, &mut searcher);
|
||||
new_vector_id_docids.push((vector_id as u32, docid));
|
||||
}
|
||||
}
|
||||
|
||||
vector_id_docid.clear(self.wtxn)?;
|
||||
for (vector_id, docid) in new_vector_id_docids {
|
||||
vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
|
||||
}
|
||||
self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
|
||||
}
|
||||
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
||||
|
||||
Ok(DetailedDocumentDeletionResult {
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use serde_json::from_slice;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vector` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
#[logging_timer::time]
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
vector_fid: FieldId,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
|
||||
// first we get the _vector field
|
||||
if let Some(vector) = obkv.get(vector_fid) {
|
||||
// try to extract the vector
|
||||
let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
|
||||
let bytes = cast_slice(&vector);
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
// else => the _vector object was `null`, there is nothing to do
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
@@ -4,6 +4,7 @@ mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_fid_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
@@ -22,6 +23,7 @@ use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::extract_vector_points;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
@@ -45,6 +47,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
faceted_fields: HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
vector_field_id: Option<FieldId>,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
exact_attributes: HashSet<FieldId>,
|
||||
@@ -69,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
vector_field_id,
|
||||
&stop_words,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
@@ -279,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
vector_field_id: Option<FieldId>,
|
||||
stop_words: &Option<fst::Set<&[u8]>>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
@@ -307,6 +312,20 @@ fn send_and_extract_flattened_documents_data(
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(vector_field_id) = vector_field_id {
|
||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
|
||||
let _ = match result {
|
||||
Ok(vector_points) => {
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||
}
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
|
||||
@@ -304,6 +304,8 @@ where
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
// get the fid of the `_vector` field.
|
||||
let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||
@@ -340,6 +342,7 @@ where
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
vector_field_id,
|
||||
stop_words,
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
|
||||
@@ -4,20 +4,24 @@ use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use charabia::{Language, Script};
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::RwTxn;
|
||||
use hnsw::Searcher;
|
||||
use roaring::RoaringBitmap;
|
||||
use space::KnnPoints;
|
||||
|
||||
use super::helpers::{
|
||||
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||
};
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use crate::error::UserError;
|
||||
use crate::facet::FacetType;
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
|
||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
@@ -38,6 +42,7 @@ pub(crate) enum TypedChunk {
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
||||
GeoPoints(grenad::Reader<File>),
|
||||
VectorPoints(grenad::Reader<File>),
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||
}
|
||||
|
||||
@@ -221,6 +226,38 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints(vector_points) => {
|
||||
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
|
||||
let mut searcher = Searcher::new();
|
||||
|
||||
let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
|
||||
Some(result) => {
|
||||
let (vector_id, _) = result?;
|
||||
Some(hnsw.get_point(vector_id.get() as usize).len())
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut cursor = vector_points.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
// TODO Move this error in the vector extractor
|
||||
let found = vector.len();
|
||||
let expected = *expected_dimensions.get_or_insert(found);
|
||||
if expected != found {
|
||||
return Err(UserError::InvalidVectorDimensions { expected, found })?;
|
||||
}
|
||||
|
||||
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
|
||||
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
|
||||
}
|
||||
log::debug!("There are {} entries in the HNSW so far", hnsw.len());
|
||||
index.put_vector_hnsw(wtxn, &hnsw)?;
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
||||
let mut buffer = Vec::new();
|
||||
for (key, value) in hash_pair {
|
||||
|
||||
@@ -4,8 +4,7 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||
pub use self::index_documents::{
|
||||
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
|
||||
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::prefix_word_pairs::{
|
||||
|
||||
Reference in New Issue
Block a user