never store the _vectors as searchable or faceted fields

This commit is contained in:
Tamo
2024-05-21 17:08:45 +02:00
parent 4148fbbe85
commit 7a84697570
3 changed files with 124 additions and 9 deletions

View File

@ -23,6 +23,7 @@ use crate::heed_codec::{
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::{Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -644,6 +645,7 @@ impl Index {
&self,
wtxn: &mut RwTxn,
user_fields: &[&str],
non_searchable_fields_ids: &[FieldId],
fields_ids_map: &FieldsIdsMap,
) -> Result<()> {
// We can write the user defined searchable fields as-is.
@ -662,6 +664,7 @@ impl Index {
for (weight, user_field) in user_fields.iter().enumerate() {
if crate::is_faceted_by(field_from_map, user_field)
&& !real_fields.contains(&field_from_map)
&& !non_searchable_fields_ids.contains(&id)
{
real_fields.push(field_from_map);
@ -708,6 +711,7 @@ impl Index {
Ok(self
.fields_ids_map(rtxn)?
.names()
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|field| Cow::Owned(field.to_string()))
.collect())
})
@ -1669,15 +1673,17 @@ pub(crate) mod tests {
use big_s::S;
use heed::{EnvOpenOptions, RwTxn};
use maplit::hashset;
use maplit::{btreemap, hashset};
use tempfile::TempDir;
use crate::documents::DocumentsBatchReader;
use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
Settings,
};
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
pub(crate) struct TempIndex {
@ -2783,4 +2789,95 @@ pub(crate) mod tests {
]
"###);
}
#[test]
fn vectors_are_never_indexed_as_searchable_or_filterable() {
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "_vectors": { "doggo": [2345] } },
{ "id": 1, "_vectors": { "doggo": [6789] } },
]))
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @r###"["id"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
drop(rtxn);
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
index
.update_settings(|settings| {
settings.set_embedder_settings(btreemap! {
S("doggo") => Setting::Set(EmbeddingSettings {
dimensions: Setting::Set(1),
source: Setting::Set(EmbedderSource::UserProvided),
..EmbeddingSettings::default()}),
});
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
}
}