4845: Fix perf regression facet strings r=ManyTheFish a=dureuill

Benchmarks between v1.9 and v1.10 show a performance regression of about x2 (+3dB regression) for most indexing workloads (+44s for hackernews).

[Benchmark interpretation in the engine weekly meeting](https://www.notion.so/meilisearch/Engine-weekly-4d49560d374c4a87b4e3d126a261d4a0?pvs=4#98a709683276450295fcfe1f8ea5cef3).

- Initial investigation pointed to #4819 as the origin of the regression.
- Further investigation points towards the hypernormalization of each facet value in `extract_facet_string_docids`
- Most of the slowdown is in `normalize_facet_strings`, and precisely in `detection.language()`.

This PR improves the situation (-10s compared with `main` for hackernews, so only +34s regression compared with `v1.9`) by skipping normalization when it can be skipped.

I'm not sure how to fix the root cause though. Should we skip facet locale normalization for now? Cc `@ManyTheFish` 

---

Tentative resolution options:

1. remove locale normalization from facet. I'm not sure why this is required, I believe we weren't doing this before, so maybe we can stop doing that again.
2. don't do language detection when it can be helped: won't help with the regressions in benchmark, but maybe we can skip language detection when the locales contain only one language?
3. use a faster language detection library: `@Kerollmops` told me about https://github.com/quickwit-oss/whichlang which bolsters x10 to x100 throughput compared with whatlang. Should we consider replacing whatlang with whichlang? Now I understand whichlang supports fewer languages than whatlang, so I also suggest:
4. use whichlang when the list of locales is empty (autodetection), or when it only contains locales that whichlang can detect. If the list of locales contains locales that whichlang *cannot* detect, **then** use whatlang instead.

---

> [!CAUTION]
> this PR contains a commit that adds detailed spans, that were used to detect which part of `extract_facet_string_docids` was taking too much time. As this commit adds spans that are called too often and adds 7s overhead, it should be removed before landing.

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2024-08-19 06:29:48 +00:00
committed by GitHub
3 changed files with 158 additions and 13 deletions

View File

@@ -339,10 +339,18 @@ impl ValuesCollection {
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language: detection.language(),
language,
..Default::default()
};

View File

@@ -12,6 +12,7 @@ use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
@@ -28,6 +29,116 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
if settings_diff.settings_update_only() {
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
} else {
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
extract_facet_string_docids_document_update(
docid_fid_facet_string,
indexer,
localized_field_ids,
)
}
}
/// Extracts the facet string and the documents ids where this facet string appear.
///
/// Returns a grenad reader with the list of extracted facet strings and
/// documents ids from the given chunk of docid facet string positions.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
localized_field_ids: &LocalizedFieldIds,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut normalized_facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_deladd_btreeset_string,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
&& deladd_reader.get(DelAdd::Addition).is_some();
if is_same_value {
continue;
}
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
{
let locales = localized_field_ids.locales(field_id);
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// as the facet string is the same, we can put the deletion and addition in the same obkv.
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
obkv.insert(deladd_key, val)?;
}
obkv.finish()?;
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
}
/// Extracts the facet string and the documents ids where this facet string appear.
///
/// Returns a grenad reader with the list of extracted facet strings and
/// documents ids from the given chunk of docid facet string positions.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
@@ -60,6 +171,15 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let are_same_locales = old_locales == new_locales;
if is_same_value && are_same_locales {
continue;
}
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
@@ -68,15 +188,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
// Facet search normalization
{
let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
let new_hyper_normalized_value = if are_same_locales {
&old_hyper_normalized_value
} else {
&normalize_facet_string(normalized_value, new_locales)
};
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// if the facet string is the same, we can put the deletion and addition in the same obkv.
if old_hyper_normalized_value == new_hyper_normalized_value {
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
// nothing to do if we delete and re-add the value.
if is_same_value {
continue;
@@ -148,12 +270,21 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
/// Normalizes the facet string and truncates it to the max length.
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
let script = detection.script();
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language: detection.language(),
script,
language,
..Default::default()
};