mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-11-30 17:55:36 +00:00

Compare commits: 18 commits, v1.6.0-rc. ... prototype-
| Author | SHA1 | Date |
|---|---|---|
| | d7bd3217d6 | |
| | 6210377b90 | |
| | 9e3d1e1bbd | |
| | bceaf4f981 | |
| | d29b301618 | |
| | a6fa0b97ec | |
| | 38abfec611 | |
| | 84a5c304fc | |
| | e93d36d5b9 | |
| | 95f8e21533 | |
| | 68f197624e | |
| | b79b03d4e2 | |
| | 86270e6878 | |
| | 81b6128b29 | |
| | 5f5a486895 | |
| | 5f4fc6c955 | |
| | 1f5e8fc072 | |
| | 3f3462ab62 | |
@@ -600,11 +600,12 @@ pub fn settings(
         ),
     };

-    let embedders = index
+    let embedders: BTreeMap<_, _> = index
         .embedding_configs(rtxn)?
         .into_iter()
         .map(|(name, config)| (name, Setting::Set(config.into())))
         .collect();
+    let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

     Ok(Settings {
         displayed_attributes: match displayed_attributes {
@@ -631,7 +632,7 @@ pub fn settings(
         typo_tolerance: Setting::Set(typo_tolerance),
         faceting: Setting::Set(faceting),
         pagination: Setting::Set(pagination),
-        embedders: Setting::Set(embedders),
+        embedders,
         _kind: PhantomData,
     })
 }
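The net effect of these two hunks: an index with no embedder configs now reports its `embedders` setting as `NotSet` rather than `Set` of an empty map, so the key drops out of the settings payload entirely (which is why the dump snapshots further down lose their `"embedders": {}` line). A minimal sketch of the pattern, using a stand-in `Setting` enum rather than meilisearch's real type:

```rust
use std::collections::BTreeMap;

// Stand-in for meilisearch's Setting<T>, which lives in milli; the real type is
// assumed to skip NotSet during serialization, removing the key from the JSON.
#[derive(Debug, PartialEq)]
enum Setting<T> {
    Set(T),
    NotSet,
}

fn embedders_setting(configs: BTreeMap<String, String>) -> Setting<BTreeMap<String, String>> {
    // Mirrors the diff: an empty config map becomes NotSet instead of Set({}).
    if configs.is_empty() { Setting::NotSet } else { Setting::Set(configs) }
}

fn main() {
    assert_eq!(embedders_setting(BTreeMap::new()), Setting::NotSet);
}
```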
@@ -458,7 +458,7 @@ make_setting_route!(
     json!({
         "proximity_precision": {
             "set": precision.is_some(),
-            "value": precision,
+            "value": precision.unwrap_or_default(),
         }
     }),
     Some(req),
@@ -690,7 +690,8 @@ pub async fn update_all(
         "set": new_settings.distinct_attribute.as_ref().set().is_some()
     },
     "proximity_precision": {
-        "set": new_settings.proximity_precision.as_ref().set().is_some()
+        "set": new_settings.proximity_precision.as_ref().set().is_some(),
+        "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
     },
     "typo_tolerance": {
         "enabled": new_settings.typo_tolerance
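Both analytics payloads now always report a concrete `value` by falling back to the type's default. A small sketch of that fallback, assuming a `ProximityPrecision` enum whose default is `ByWord` (consistent with the `"byWord"` default asserted in the settings test below; the second variant is an assumption):

```rust
#[derive(Clone, Copy, Debug, Default, PartialEq)]
enum ProximityPrecision {
    #[default]
    ByWord,
    ByAttribute, // assumed variant; only the default matters for this sketch
}

fn main() {
    let precision: Option<ProximityPrecision> = None;
    // Same shape as the analytics change: report the effective value, not a null.
    assert_eq!(precision.unwrap_or_default(), ProximityPrecision::ByWord);
}
```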
@@ -735,6 +735,9 @@ pub fn perform_facet_search(
     if let Some(facet_query) = &facet_query {
         facet_search.query(facet_query);
     }
+    if let Some(max_facets) = index.max_values_per_facet(&rtxn)? {
+        facet_search.max_values(max_facets as usize);
+    }

     Ok(FacetSearchResult {
         facet_hits: facet_search.execute()?,
@@ -897,6 +900,14 @@ fn format_fields<'a>(
     let mut matches_position = compute_matches.then(BTreeMap::new);
     let mut document = document.clone();

+    // reduce the formatted option list to the attributes that should be formatted,
+    // instead of all the attributes to display.
+    let formatting_fields_options: Vec<_> = formatted_options
+        .iter()
+        .filter(|(_, option)| option.should_format())
+        .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
+        .collect();
+
     // select the attributes to retrieve
     let displayable_names =
         displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
@@ -905,13 +916,15 @@ fn format_fields<'a>(
     // to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
     // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
     // highlighted.
-    let format = formatted_options
+    // Warn: The time to compute the format list scales with the number of fields to format;
+    // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
+    // d*f where d is the total number of fields to display and f is the total number of fields to format.
+    let format = formatting_fields_options
         .iter()
-        .filter(|(field, _option)| {
-            let name = field_ids_map.name(**field).unwrap();
+        .filter(|(name, _option)| {
             milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
         })
-        .map(|(_, option)| *option)
+        .map(|(_, option)| **option)
         .reduce(|acc, option| acc.merge(option));
     let mut infos = Vec::new();
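The filter above matches each field against the formatting rules in both directions, so a rule set on a parent path applies to its nested children and vice versa. A hedged reimplementation of the matching rule, for illustration only (not milli's actual code):

```rust
// Hypothetical reimplementation of milli::is_faceted_by: `field` matches
// `facet` when it is the same path or a dot-nested sub-path of it.
fn is_faceted_by(field: &str, facet: &str) -> bool {
    field == facet || (field.starts_with(facet) && field[facet.len()..].starts_with('.'))
}

fn main() {
    // A rule on `doggo` applies to `doggo.name`...
    assert!(is_faceted_by("doggo.name", "doggo"));
    // ...but a bare string prefix is not a parent: `dog` does not cover `doggo.name`.
    assert!(!is_faceted_by("doggo.name", "dog"));
}
```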
@@ -1008,7 +1021,7 @@ fn format_value<'a>(
                 let value = matcher.format(format_options);
                 Value::String(value.into_owned())
             }
-            None => Value::Number(number),
+            None => Value::String(s),
         }
     }
     value => value,
@@ -77,8 +77,7 @@ async fn import_dump_v1_movie_raw() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -239,8 +238,7 @@ async fn import_dump_v1_movie_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -387,8 +385,7 @@ async fn import_dump_v1_rubygems_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -521,8 +518,7 @@ async fn import_dump_v2_movie_raw() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -667,8 +663,7 @@ async fn import_dump_v2_movie_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -812,8 +807,7 @@ async fn import_dump_v2_rubygems_with_settings() {
       },
      "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -946,8 +940,7 @@ async fn import_dump_v3_movie_raw() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1092,8 +1085,7 @@ async fn import_dump_v3_movie_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1237,8 +1229,7 @@ async fn import_dump_v3_rubygems_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1371,8 +1362,7 @@ async fn import_dump_v4_movie_raw() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1517,8 +1507,7 @@ async fn import_dump_v4_movie_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1662,8 +1651,7 @@ async fn import_dump_v4_rubygems_with_settings() {
       },
       "pagination": {
         "maxTotalHits": 1000
-      },
-      "embedders": {}
+      }
     }
     "###
     );
@@ -1907,8 +1895,7 @@ async fn import_dump_v6_containing_experimental_features() {
      },
      "pagination": {
        "maxTotalHits": 1000
-     },
-     "embedders": {}
+     }
    }
    "###);
@@ -105,6 +105,24 @@ async fn more_advanced_facet_search() {
     snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
 }

+#[actix_rt::test]
+async fn simple_facet_search_with_max_values() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await;
+    index.update_settings_filterable_attributes(json!(["genres"])).await;
+    index.add_documents(documents, None).await;
+    index.wait_task(2).await;
+
+    let (response, code) =
+        index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;
+
+    assert_eq!(code, 200, "{}", response);
+    assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1);
+}
+
 #[actix_rt::test]
 async fn non_filterable_facet_search_error() {
     let server = Server::new().await;
@@ -54,7 +54,7 @@ async fn get_settings() {
     let (response, code) = index.settings().await;
     assert_eq!(code, 200);
     let settings = response.as_object().unwrap();
-    assert_eq!(settings.keys().len(), 16);
+    assert_eq!(settings.keys().len(), 15);
     assert_eq!(settings["displayedAttributes"], json!(["*"]));
     assert_eq!(settings["searchableAttributes"], json!(["*"]));
     assert_eq!(settings["filterableAttributes"], json!([]));
@@ -83,7 +83,6 @@ async fn get_settings() {
         "maxTotalHits": 1000,
       })
     );
-    assert_eq!(settings["embedders"], json!({}));
     assert_eq!(settings["proximityPrecision"], json!("byWord"));
 }
@@ -27,8 +27,8 @@ static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
 static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
 static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));

-/// The maximum number of facets returned by the facet search route.
-const MAX_NUMBER_OF_FACETS: usize = 100;
+/// The maximum number of values per facet returned by the facet search route.
+const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;

 pub mod facet;
 mod fst_utils;
@@ -306,6 +306,7 @@ pub struct SearchForFacetValues<'a> {
     query: Option<String>,
     facet: String,
     search_query: Search<'a>,
+    max_values: usize,
     is_hybrid: bool,
 }
@@ -315,7 +316,13 @@ impl<'a> SearchForFacetValues<'a> {
         search_query: Search<'a>,
         is_hybrid: bool,
     ) -> SearchForFacetValues<'a> {
-        SearchForFacetValues { query: None, facet, search_query, is_hybrid }
+        SearchForFacetValues {
+            query: None,
+            facet,
+            search_query,
+            max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
+            is_hybrid,
+        }
     }

     pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
@@ -323,6 +330,11 @@ impl<'a> SearchForFacetValues<'a> {
         self
     }

+    pub fn max_values(&mut self, max: usize) -> &mut Self {
+        self.max_values = max;
+        self
+    }
+
     fn one_original_value_of(
         &self,
         field_id: FieldId,
@@ -462,7 +474,7 @@ impl<'a> SearchForFacetValues<'a> {
                     .unwrap_or_else(|| left_bound.to_string());
                 results.push(FacetValueHit { value, count });
             }
-            if results.len() >= MAX_NUMBER_OF_FACETS {
+            if results.len() >= self.max_values {
                 break;
             }
         }
@@ -507,7 +519,7 @@ impl<'a> SearchForFacetValues<'a> {
                     .unwrap_or_else(|| query.to_string());
                 results.push(FacetValueHit { value, count });
             }
-            if results.len() >= MAX_NUMBER_OF_FACETS {
+            if results.len() >= self.max_values {
                 return Ok(ControlFlow::Break(()));
             }
         }
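Taken together, these hunks turn the old hard cap of 100 facet hits into a per-search builder knob whose default keeps the old value. A self-contained sketch of the builder pattern (mock types, not milli's; the real `new` also takes the inner `Search` and the hybrid flag as shown above):

```rust
// Mirrors the new knob: max_values defaults to 100 and can be overridden per search.
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;

struct SearchForFacetValues {
    query: Option<String>,
    max_values: usize,
}

impl SearchForFacetValues {
    fn new() -> Self {
        Self { query: None, max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET }
    }

    fn query(&mut self, query: impl Into<String>) -> &mut Self {
        self.query = Some(query.into());
        self
    }

    fn max_values(&mut self, max: usize) -> &mut Self {
        self.max_values = max;
        self
    }
}

fn main() {
    let mut facet_search = SearchForFacetValues::new();
    // Mirrors the route wiring above: the maxValuesPerFacet setting caps the hits.
    facet_search.query("a").max_values(1);
    assert_eq!(facet_search.max_values, 1);
}
```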
@@ -15,6 +15,7 @@ pub struct BucketSortOutput {

 // TODO: would probably be good to regroup some of these inside of a struct?
 #[allow(clippy::too_many_arguments)]
+#[logging_timer::time]
 pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     ctx: &mut SearchContext<'ctx>,
     mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
@@ -72,7 +72,7 @@ impl<'m> MatcherBuilder<'m> {
     }
 }

-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, Debug)]
 pub struct FormatOptions {
     pub highlight: bool,
     pub crop: Option<usize>,
@@ -82,6 +82,10 @@ impl FormatOptions {
     pub fn merge(self, other: Self) -> Self {
         Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
     }
+
+    pub fn should_format(&self) -> bool {
+        self.highlight || self.crop.is_some()
+    }
 }

 #[derive(Clone, Debug)]
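These two methods are what the `format_fields` prefiltering above relies on: `should_format` drops fields with no formatting work to do, and `merge` combines a parent rule with a child rule. A runnable example copying the `FormatOptions` shape from this diff (only the added `PartialEq` derive is ours, for the assertions):

```rust
#[derive(Copy, Clone, Default, Debug, PartialEq)]
struct FormatOptions {
    highlight: bool,
    crop: Option<usize>,
}

impl FormatOptions {
    fn merge(self, other: Self) -> Self {
        Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
    }
    fn should_format(&self) -> bool {
        self.highlight || self.crop.is_some()
    }
}

fn main() {
    // Highlight `doggo` and crop `doggo.name`: the merged rule for `doggo.name`
    // both highlights and crops, matching the comment in format_fields.
    let parent = FormatOptions { highlight: true, crop: None };
    let child = FormatOptions { highlight: false, crop: Some(10) };
    assert_eq!(parent.merge(child), FormatOptions { highlight: true, crop: Some(10) });

    // A field with neither option set is now filtered out before formatting.
    assert!(!FormatOptions::default().should_format());
}
```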
@@ -191,6 +191,7 @@ fn resolve_maximally_reduced_query_graph(
     Ok(docids)
 }

+#[logging_timer::time]
 fn resolve_universe(
     ctx: &mut SearchContext,
     initial_universe: &RoaringBitmap,
@@ -556,6 +557,7 @@ pub fn execute_vector_search(
 }

 #[allow(clippy::too_many_arguments)]
+#[logging_timer::time]
 pub fn execute_search(
     ctx: &mut SearchContext,
     query: Option<&str>,
@@ -5,6 +5,7 @@ use super::*;
 use crate::{Result, SearchContext, MAX_WORD_LENGTH};

 /// Convert the tokenised search query into a list of located query terms.
+#[logging_timer::time]
 pub fn located_query_terms_from_tokens(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter,
@@ -61,6 +61,7 @@ impl FacetsUpdateIncremental {
     }
 }

+#[logging_timer::time("FacetsUpdateIncremental::{}")]
 pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
     let mut cursor = self.delta_data.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
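Several hunks in this compare just sprinkle `#[logging_timer::time]` over hot search and indexing functions. A sketch of what that buys, based on the logging_timer crate's documented behavior (exact log format and level are the crate's defaults; `env_logger` here is only an assumed backend): the attribute logs the elapsed time when the function returns, and a pattern argument such as `"FacetsUpdateIncremental::{}"` substitutes the function name for the `{}`.

```rust
use logging_timer::time;

// Assumed crate behavior: the attribute wraps the function body in a timer
// whose elapsed time is emitted through the standard `log` facade.
#[time]
fn located_query_terms() -> u64 {
    (0..1_000_000u64).sum()
}

#[time("FacetsUpdateIncremental::{}")] // `{}` becomes the function name
fn execute() {}

fn main() {
    env_logger::init(); // any `log` backend will do
    located_query_terms();
    execute();
}
```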
@@ -76,26 +76,18 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
 pub const FACET_GROUP_SIZE: u8 = 4;
 pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

-use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::BufReader;
-use std::iter::FromIterator;

-use charabia::normalizer::{Normalize, NormalizerOption};
-use grenad::{CompressionType, SortAlgorithm};
-use heed::types::{Bytes, DecodeIgnore, SerdeJson};
-use heed::BytesEncode;
 use log::debug;
 use time::OffsetDateTime;

 use self::incremental::FacetsUpdateIncremental;
 use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
-use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::BytesRefCodec;
-use crate::update::index_documents::create_sorter;
-use crate::update::merge_btreeset_string;
-use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH};
+use crate::{Index, Result};

 pub mod bulk;
 pub mod incremental;
@@ -146,115 +138,114 @@ impl<'i> FacetsUpdate<'i> {
         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

         // See self::comparison_bench::benchmark_facet_indexing
-        if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
-            let field_ids =
-                self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
-            let bulk_update = FacetsUpdateBulk::new(
-                self.index,
-                field_ids,
-                self.facet_type,
-                self.delta_data,
-                self.group_size,
-                self.min_level_size,
-            );
-            bulk_update.execute(wtxn)?;
-        } else {
-            let incremental_update = FacetsUpdateIncremental::new(
-                self.index,
-                self.facet_type,
-                self.delta_data,
-                self.group_size,
-                self.min_level_size,
-                self.max_group_size,
-            );
-            incremental_update.execute(wtxn)?;
-        }
-
-        // We clear the list of normalized-for-search facets
-        // and the previous FSTs to compute everything from scratch
-        self.index.facet_id_normalized_string_strings.clear(wtxn)?;
-        self.index.facet_id_string_fst.clear(wtxn)?;
-
-        // As we can't use the same write transaction to read and write in two different databases
-        // we must create a temporary sorter that we will write into LMDB afterward.
-        // As multiple unnormalized facet values can become the same normalized facet value
-        // we must merge them together.
-        let mut sorter = create_sorter(
-            SortAlgorithm::Unstable,
-            merge_btreeset_string,
-            CompressionType::None,
-            None,
-            None,
-            None,
-        );
-
-        // We iterate on the list of original, semi-normalized, facet values
-        // and normalize them for search, inserting them in LMDB in any given order.
-        let options = NormalizerOption { lossy: true, ..Default::default() };
-        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
-        for result in database.iter(wtxn)? {
-            let (facet_group_key, ()) = result?;
-            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                let mut normalized_facet = left_bound.normalize(&options);
-                let normalized_truncated_facet: String;
-                if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
-                    normalized_truncated_facet = normalized_facet
-                        .char_indices()
-                        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                        .map(|(_, c)| c)
-                        .collect();
-                    normalized_facet = normalized_truncated_facet.into();
-                }
-                let set = BTreeSet::from_iter(std::iter::once(left_bound));
-                let key = (field_id, normalized_facet.as_ref());
-                let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
-                let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
-                sorter.insert(key, val)?;
-            }
-        }
-
-        // In this loop we don't need to take care of merging bitmaps
-        // as the grenad sorter already merged them for us.
-        let mut merger_iter = sorter.into_stream_merger_iter()?;
-        while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
-            self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
-                wtxn,
-                key_bytes,
-                btreeset_bytes,
-            )?;
-        }
-
-        // We compute one FST by string facet
-        let mut text_fsts = vec![];
-        let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
-        let database =
-            self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
-        for result in database.iter(wtxn)? {
-            let ((field_id, normalized_facet), _) = result?;
-            current_fst = match current_fst.take() {
-                Some((fid, fst_builder)) if fid != field_id => {
-                    let fst = fst_builder.into_set();
-                    text_fsts.push((fid, fst));
-                    Some((field_id, fst::SetBuilder::memory()))
-                }
-                Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
-                None => Some((field_id, fst::SetBuilder::memory())),
-            };
-
-            if let Some((_, fst_builder)) = current_fst.as_mut() {
-                fst_builder.insert(normalized_facet)?;
-            }
-        }
-
-        if let Some((field_id, fst_builder)) = current_fst {
-            let fst = fst_builder.into_set();
-            text_fsts.push((field_id, fst));
-        }
-
-        // We write those FSTs in LMDB now
-        for (field_id, fst) in text_fsts {
-            self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
-        }
+        // if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
+        let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
+        let bulk_update = FacetsUpdateBulk::new(
+            self.index,
+            field_ids,
+            self.facet_type,
+            self.delta_data,
+            self.group_size,
+            self.min_level_size,
+        );
+        bulk_update.execute(wtxn)?;
+        // } else {
+        //     let incremental_update = FacetsUpdateIncremental::new(
+        //         self.index,
+        //         self.facet_type,
+        //         self.delta_data,
+        //         self.group_size,
+        //         self.min_level_size,
+        //         self.max_group_size,
+        //     );
+        //     incremental_update.execute(wtxn)?;
+        // }
+
+        // // We clear the list of normalized-for-search facets
+        // // and the previous FSTs to compute everything from scratch
+        // self.index.facet_id_normalized_string_strings.clear(wtxn)?;
+        // self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // // As we can't use the same write transaction to read and write in two different databases
+        // // we must create a temporary sorter that we will write into LMDB afterward.
+        // // As multiple unnormalized facet values can become the same normalized facet value
+        // // we must merge them together.
+        // let mut sorter = create_sorter(
+        //     SortAlgorithm::Unstable,
+        //     merge_btreeset_string,
+        //     CompressionType::None,
+        //     None,
+        //     None,
+        //     None,
+        // );
+
+        // // We iterate on the list of original, semi-normalized, facet values
+        // // and normalize them for search, inserting them in LMDB in any given order.
+        // let options = NormalizerOption { lossy: true, ..Default::default() };
+        // let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        // for result in database.iter(wtxn)? {
+        //     let (facet_group_key, ()) = result?;
+        //     if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+        //         let mut normalized_facet = left_bound.normalize(&options);
+        //         let normalized_truncated_facet: String;
+        //         if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
+        //             normalized_truncated_facet = normalized_facet
+        //                 .char_indices()
+        //                 .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+        //                 .map(|(_, c)| c)
+        //                 .collect();
+        //             normalized_facet = normalized_truncated_facet.into();
+        //         }
+        //         let set = BTreeSet::from_iter(std::iter::once(left_bound));
+        //         let key = (field_id, normalized_facet.as_ref());
+        //         let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+        //         let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+        //         sorter.insert(key, val)?;
+        //     }
+        // }
+
+        // // In this loop we don't need to take care of merging bitmaps
+        // // as the grenad sorter already merged them for us.
+        // let mut merger_iter = sorter.into_stream_merger_iter()?;
+        // while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
+        //     self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
+        //         wtxn,
+        //         key_bytes,
+        //         btreeset_bytes,
+        //     )?;
+        // }
+
+        // // We compute one FST by string facet
+        // let mut text_fsts = vec![];
+        // let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
+        // let database =
+        //     self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
+        // for result in database.iter(wtxn)? {
+        //     let ((field_id, normalized_facet), _) = result?;
+        //     current_fst = match current_fst.take() {
+        //         Some((fid, fst_builder)) if fid != field_id => {
+        //             let fst = fst_builder.into_set();
+        //             text_fsts.push((fid, fst));
+        //             Some((field_id, fst::SetBuilder::memory()))
+        //         }
+        //         Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+        //         None => Some((field_id, fst::SetBuilder::memory())),
+        //     };
+
+        //     if let Some((_, fst_builder)) = current_fst.as_mut() {
+        //         fst_builder.insert(normalized_facet)?;
+        //     }
+        // }
+
+        // if let Some((field_id, fst_builder)) = current_fst {
+        //     let fst = fst_builder.into_set();
+        //     text_fsts.push((field_id, fst));
+        // }
+
+        // // We write those FSTs in LMDB now
+        // for (field_id, fst) in text_fsts {
+        //     self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
+        // }

         Ok(())
     }
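This hunk is the prototype's core change: the delta-size heuristic that used to pick between bulk and incremental facet indexing is short-circuited so the bulk path always runs, and the search normalization plus FST rebuild is commented out wholesale. The bypassed decision rule itself is simple; a tiny sketch of it:

```rust
// The rule the prototype comments out (see the benchmark referenced as
// self::comparison_bench::benchmark_facet_indexing): go bulk when the delta
// is at least 1/50th of the existing facet database.
fn takes_bulk_path(delta_len: u64, database_len: u64) -> bool {
    delta_len >= database_len / 50
}

fn main() {
    assert!(takes_bulk_path(200, 10_000));  // large delta: bulk rebuild of the level tree
    assert!(!takes_bulk_path(199, 10_000)); // small delta: incremental, before this prototype
}
```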
@@ -123,6 +123,8 @@ pub(crate) fn write_typed_chunk_into_index(
 ) -> Result<(RoaringBitmap, bool)> {
     puffin::profile_function!(typed_chunk.to_debug_string());

+    log::debug!("Received a chunk to process: {}", typed_chunk.to_debug_string());
+
     let mut is_merged_database = false;
     match typed_chunk {
         TypedChunk::Documents(obkv_documents_iter) => {