Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-17 20:00:58 +00:00)

Compare commits: prototype-... ... prototype-... (9 commits)

Commits (SHA1):
  7e2fd82e41
  24c0775c67
  3092cf0448
  37d4551e8e
  da48506f15
  370d88f626
  d34faa8f9c
  e5d0bef6d8
  e704728ee7

@@ -675,9 +675,6 @@ impl IndexScheduler {
        }

        // 3. Snapshot every index.
        // TODO: we are opening all of the indexes; it can be too much, we should unload all
        // of the indexes we are trying to open. It would be even better to only unload
        // the ones that were opened by us. Or maybe use an LRU in the index mapper.
        for result in self.index_mapper.index_mapping.iter(&rtxn)? {
            let (name, uuid) = result?;
            let index = self.index_mapper.index(&rtxn, name)?;

@@ -714,6 +711,14 @@ impl IndexScheduler {
        // 5.3 Change the permissions to make the snapshot read-only
        let mut permissions = file.metadata()?.permissions();
        permissions.set_readonly(true);
        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            #[allow(clippy::non_octal_unix_permissions)]
            // r--r--r--
            permissions.set_mode(0b100100100);
        }

        file.set_permissions(permissions)?;

        for task in &mut tasks {

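For reference, a minimal standalone sketch of what the hunk above does (not the Meilisearch code path; the helper name is hypothetical): the snapshot file is made read-only, and on Unix the mode bits are forced to r--r--r--. Note that the binary literal used in the hunk, 0b100100100, is the same value as octal 0o444.

    use std::fs::File;
    use std::io;

    // Sketch only: make an already-open file read-only for owner, group and others.
    fn make_snapshot_readonly(file: &File) -> io::Result<()> {
        let mut permissions = file.metadata()?.permissions();
        permissions.set_readonly(true); // portable part: clears the write bits
        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            assert_eq!(0b100100100, 0o444); // same value, different notation
            permissions.set_mode(0o444); // r--r--r--
        }
        file.set_permissions(permissions)
    }
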
@@ -100,6 +100,8 @@ pub async fn list_indexes(
            Ok(Some(IndexView::new(uid.to_string(), index)?))
        })?;
    // This doesn't open all the indexes, because IndexView doesn't keep the `Index` open.
    // The collect below looks needless, but removing it fails to compile: the trait
    // `ExactSizeIterator` is not implemented for `Flatten<IntoIter<Option<IndexView>>>`.
    #[allow(clippy::needless_collect)]
    let indexes: Vec<IndexView> = indexes.into_iter().flatten().collect();
    let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter());

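A minimal standalone sketch of why that collect stays (the `needs_sized` helper is a made-up stand-in for sized pagination, not the Meilisearch API): sized pagination needs an `ExactSizeIterator`, which `Flatten` cannot provide but `std::vec::IntoIter` can.

    // Sketch only: a stand-in for a helper that needs to know the total length up front.
    fn needs_sized<I: ExactSizeIterator>(iter: I) -> usize {
        iter.len()
    }

    fn main() {
        let maybe_views = vec![Some("movies"), None, Some("books")];
        // needs_sized(maybe_views.into_iter().flatten()); // does not compile: `Flatten` is not ExactSizeIterator
        let views: Vec<&str> = maybe_views.into_iter().flatten().collect();
        assert_eq!(needs_sized(views.into_iter()), 2);
    }
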
@@ -378,6 +378,11 @@ pub fn perform_search(
    let mut tokenizer_builder = TokenizerBuilder::default();
    tokenizer_builder.create_char_map(true);

    let script_lang_map = index.script_language(&rtxn)?;
    if !script_lang_map.is_empty() {
        tokenizer_builder.allow_list(&script_lang_map);
    }

    let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
    formatter_builder.crop_marker(query.crop_marker);
    formatter_builder.highlight_prefix(query.highlight_pre_tag);

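To make the intent concrete, here is a small self-contained sketch of the charabia pattern used above (the specific Script/Language variants and the sample text are assumptions, not taken from the diff): an allow list restricts which languages the tokenizer may detect for each script.

    use std::collections::HashMap;

    use charabia::{Language, Script, TokenizerBuilder};

    fn main() {
        // Hypothetical allow list: Latin-script text should only be detected as English.
        let mut script_lang_map: HashMap<Script, Vec<Language>> = HashMap::new();
        script_lang_map.insert(Script::Latin, vec![Language::Eng]);

        let mut tokenizer_builder = TokenizerBuilder::default();
        tokenizer_builder.create_char_map(true);
        if !script_lang_map.is_empty() {
            tokenizer_builder.allow_list(&script_lang_map);
        }
        let tokenizer = tokenizer_builder.build();

        for token in tokenizer.tokenize("The quick brown fox") {
            println!("{:?} -> {:?}", token.lemma(), token.language);
        }
    }
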
@@ -1,8 +1,8 @@
use std::time::Duration;

use actix_rt::time::sleep;
use meilisearch::option::ScheduleSnapshot;
use meilisearch::Opt;
use tokio::time::sleep;

use crate::common::server::default_settings;
use crate::common::{GetAllDocumentsOptions, Server};

@@ -23,21 +23,20 @@ macro_rules! verify_snapshot {
            };
            let (snapshot, _) = test(snapshot.clone()).await;
            let (orig, _) = test(orig.clone()).await;
            assert_eq!(snapshot, orig);
            assert_eq!(snapshot, orig, "Got \n{}\nWhile expecting:\n{}", serde_json::to_string_pretty(&snapshot).unwrap(), serde_json::to_string_pretty(&orig).unwrap());
        }
        )*
    };
}

#[actix_rt::test]
#[ignore] // TODO: unignore
async fn perform_snapshot() {
    let temp = tempfile::tempdir().unwrap();
    let snapshot_dir = tempfile::tempdir().unwrap();

    let options = Opt {
        snapshot_dir: snapshot_dir.path().to_owned(),
        schedule_snapshot: ScheduleSnapshot::Enabled(1),
        schedule_snapshot: ScheduleSnapshot::Enabled(2),
        ..default_settings(temp.path())
    };

@@ -61,6 +60,16 @@ async fn perform_snapshot() {
    let temp = tempfile::tempdir().unwrap();

    let snapshot_path = snapshot_dir.path().to_owned().join("db.snapshot");
    #[cfg_attr(windows, allow(unused))]
    let snapshot_meta = std::fs::metadata(&snapshot_path).unwrap();

    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let mode = snapshot_meta.permissions().mode();
        // r--r--r--
        meili_snap::snapshot!(format!("{:b}", mode), @"1000000100100100");
    }

    let options = Opt { import_snapshot: Some(snapshot_path), ..default_settings(temp.path()) };

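A quick sketch (not part of the test) decoding the asserted bit pattern: it is octal 0o100444, i.e. a regular file whose permission bits are the read-only r--r--r-- set by the scheduler.

    fn main() {
        let mode: u32 = 0b1000000100100100;
        assert_eq!(mode, 0o100444);      // regular-file type bit | 0o444
        assert_eq!(mode & 0o777, 0o444); // permission bits only: r--r--r--
        assert_eq!(format!("{:b}", mode), "1000000100100100");
    }
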
@@ -71,7 +80,10 @@ async fn perform_snapshot() {
        // for some reason the db sizes differ. this may be due to the compaction options we have
        // set when performing the snapshot
        //server.stats(),
        server.tasks(),

        // The original instance contains the snapshotCreation task, while the snapshotted
        // instance does not. For this reason we need to compare the task queues **after** task 4.
        server.tasks_filter("?from=2"),

        server.index("test").get_all_documents(GetAllDocumentsOptions::default()),
        server.index("test").settings(),
        server.index("test1").get_all_documents(GetAllDocumentsOptions::default()),

@@ -1211,11 +1211,22 @@ impl Index {
        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;

        let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
        let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new();
        let mut total = 0;
        for sl in self.script_language_docids.iter(rtxn)? {
            let ((script, language), docids) = sl?;

            // keep only Languages that contain at least 1 document.
            if !soft_deleted_documents.is_superset(&docids) {
                let remaining_documents_count = (docids - &soft_deleted_documents).len();
                total += remaining_documents_count;
                if remaining_documents_count > 0 {
                    script_language_doc_count.push((script, language, remaining_documents_count));
                }
            }
        }

        let threshold = total / 20; // 5% (arbitrary)
        for (script, language, count) in script_language_doc_count {
            if count > threshold {
                if let Some(languages) = script_language.get_mut(&script) {
                    (*languages).push(language);
                } else {

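A small self-contained sketch of the 5% filter above (plain strings stand in for the charabia Script/Language types, and the counts are invented): any (script, language) pair whose remaining document count does not exceed total / 20 is dropped from the map.

    use std::collections::HashMap;

    fn main() {
        // Hypothetical counts per (script, language) pair.
        let script_language_doc_count: Vec<(&str, &str, u64)> =
            vec![("Latin", "Eng", 95), ("Latin", "Fra", 3), ("Cj", "Jpn", 40)];
        let total: u64 = script_language_doc_count.iter().map(|(_, _, c)| c).sum(); // 138
        let threshold = total / 20; // 6 -> roughly 5% of the corpus, an arbitrary cut-off

        let mut script_language: HashMap<&str, Vec<&str>> = HashMap::new();
        for (script, language, count) in script_language_doc_count {
            if count > threshold {
                script_language.entry(script).or_default().push(language);
            }
        }
        // "Fra" (3 documents) falls under the threshold and is filtered out.
        assert_eq!(script_language["Latin"], vec!["Eng"]);
        assert_eq!(script_language["Cj"], vec!["Jpn"]);
    }
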
@@ -3,12 +3,14 @@ use std::convert::TryInto;
use std::fs::File;
use std::{io, mem, str};

use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use obkv::KvReader;
use roaring::RoaringBitmap;
use serde_json::Value;

use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::update::index_documents::MergeFn;
use crate::{
    absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
};

@@ -33,7 +35,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    let max_memory = indexer.max_memory_by_thread();

    let mut documents_ids = RoaringBitmap::new();
    let mut script_language_pair = HashMap::new();
    let mut script_language_docids = HashMap::new();
    let mut docid_word_positions_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
        concat_u32s_array,

@@ -45,11 +47,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(

    let mut key_buffer = Vec::new();
    let mut field_buffer = String::new();
    let mut builder = TokenizerBuilder::new();
    let mut tokenizer_builder = TokenizerBuilder::new();
    if let Some(stop_words) = stop_words {
        builder.stop_words(stop_words);
        tokenizer_builder.stop_words(stop_words);
    }
    let tokenizer = builder.build();
    let tokenizer = tokenizer_builder.build();

    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {

@@ -57,49 +59,121 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            .try_into()
            .map(u32::from_be_bytes)
            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
        let obkv = obkv::KvReader::<FieldId>::new(value);
        let obkv = KvReader::<FieldId>::new(value);

        documents_ids.push(document_id);
        key_buffer.clear();
        key_buffer.extend_from_slice(&document_id.to_be_bytes());

        for (field_id, field_bytes) in obkv.iter() {
            if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
                let value =
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
                field_buffer.clear();
                if let Some(field) = json_to_string(&value, &mut field_buffer) {
                    let tokens = process_tokens(tokenizer.tokenize(field))
                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
        let mut script_language_word_count = HashMap::new();

                    for (index, token) in tokens {
                        if let Some(language) = token.language {
                            let script = token.script;
                            let entry = script_language_pair
                                .entry((script, language))
                                .or_insert_with(RoaringBitmap::new);
                            entry.push(document_id);
                        }
                        let token = token.lemma().trim();
                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                            key_buffer.truncate(mem::size_of::<u32>());
                            key_buffer.extend_from_slice(token.as_bytes());
        extract_tokens_from_document(
            &obkv,
            searchable_fields,
            &tokenizer,
            max_positions_per_attributes,
            &mut key_buffer,
            &mut field_buffer,
            &mut script_language_word_count,
            &mut docid_word_positions_sorter,
        )?;

                            let position: u16 = index
                                .try_into()
                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
                            let position = absolute_from_relative_position(field_id, position);
                            docid_word_positions_sorter
                                .insert(&key_buffer, position.to_ne_bytes())?;
        // if we detect a potential mistake in the language detection,
        // we rerun the extraction, forcing the tokenizer to only detect the most frequently detected languages.
        // context: https://github.com/meilisearch/meilisearch/issues/3565
        if script_language_word_count.values().any(potential_language_detection_error) {
            // build an allow list with the most frequently detected languages in the document.
            let script_language: HashMap<_, _> =
                script_language_word_count.iter().filter_map(most_frequent_languages).collect();

            // if the allow list is empty, meaning that no Language is considered frequent,
            // then we don't rerun the extraction.
            if !script_language.is_empty() {
                // build a new temporary tokenizer including the allow list.
                let mut tokenizer_builder = TokenizerBuilder::new();
                if let Some(stop_words) = stop_words {
                    tokenizer_builder.stop_words(stop_words);
                }
                tokenizer_builder.allow_list(&script_language);
                let tokenizer = tokenizer_builder.build();

                script_language_word_count.clear();

                // rerun the extraction.
                extract_tokens_from_document(
                    &obkv,
                    searchable_fields,
                    &tokenizer,
                    max_positions_per_attributes,
                    &mut key_buffer,
                    &mut field_buffer,
                    &mut script_language_word_count,
                    &mut docid_word_positions_sorter,
                )?;
            }
        }

        for (script, languages_frequency) in script_language_word_count {
            for (language, _) in languages_frequency {
                let entry = script_language_docids
                    .entry((script, language))
                    .or_insert_with(RoaringBitmap::new);
                entry.push(document_id);
            }
        }
    }

    sorter_into_reader(docid_word_positions_sorter, indexer)
        .map(|reader| (documents_ids, reader, script_language_docids))
}

#[allow(clippy::too_many_arguments)]
fn extract_tokens_from_document<T: AsRef<[u8]>>(
    obkv: &KvReader<FieldId>,
    searchable_fields: &Option<HashSet<FieldId>>,
    tokenizer: &Tokenizer<T>,
    max_positions_per_attributes: u32,
    key_buffer: &mut Vec<u8>,
    field_buffer: &mut String,
    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
    docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
    for (field_id, field_bytes) in obkv.iter() {
        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
            let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
            field_buffer.clear();
            if let Some(field) = json_to_string(&value, field_buffer) {
                let tokens = process_tokens(tokenizer.tokenize(field))
                    .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

                for (index, token) in tokens {
                    // if a language has been detected for the token, we update the counter.
                    if let Some(language) = token.language {
                        let script = token.script;
                        let entry =
                            script_language_word_count.entry(script).or_insert_with(Vec::new);
                        match entry.iter_mut().find(|(l, _)| *l == language) {
                            Some((_, n)) => *n += 1,
                            None => entry.push((language, 1)),
                        }
                    }
                    let token = token.lemma().trim();
                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                        key_buffer.truncate(mem::size_of::<u32>());
                        key_buffer.extend_from_slice(token.as_bytes());

                        let position: u16 = index
                            .try_into()
                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
                        let position = absolute_from_relative_position(field_id, position);
                        docid_word_positions_sorter.insert(&key_buffer, position.to_ne_bytes())?;
                    }
                }
            }
        }
    }

    sorter_into_reader(docid_word_positions_sorter, indexer)
        .map(|reader| (documents_ids, reader, script_language_pair))
    Ok(())
}

/// Transform a JSON value into a string that can be indexed.

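To make the rerun condition above concrete, here is a minimal standalone sketch of the check performed by `potential_language_detection_error` (defined in the next hunk); plain strings stand in for charabia's Language type and the counts are invented.

    // Sketch only: detection is "suspicious" when several languages were seen for one script
    // and at least one of them accounts for 10% of the detected words or fewer.
    fn potential_error(languages_frequency: &[(&str, usize)]) -> bool {
        if languages_frequency.len() > 1 {
            let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
            let threshold = total / 10;
            languages_frequency.iter().any(|(_, c)| *c <= threshold)
        } else {
            false
        }
    }

    fn main() {
        // 40 words detected as English, 2 as French: 2 <= 42 / 10, so the extraction is rerun.
        assert!(potential_error(&[("Eng", 40), ("Fra", 2)]));
        // A single detected language never triggers a rerun.
        assert!(!potential_error(&[("Eng", 40)]));
    }
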
@@ -183,3 +257,36 @@ fn process_tokens<'a>(
        })
        .filter(|(_, t)| t.is_word())
}

fn potential_language_detection_error(languages_frequency: &Vec<(Language, usize)>) -> bool {
    if languages_frequency.len() > 1 {
        let threshold = compute_language_frequency_threshold(languages_frequency);
        languages_frequency.iter().any(|(_, c)| *c <= threshold)
    } else {
        false
    }
}

fn most_frequent_languages(
    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
) -> Option<(Script, Vec<Language>)> {
    if languages_frequency.len() > 1 {
        let threshold = compute_language_frequency_threshold(languages_frequency);

        let languages: Vec<_> =
            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();

        if languages.is_empty() {
            None
        } else {
            Some((*script, languages))
        }
    } else {
        None
    }
}

fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
    total / 10 // 10% is a completely arbitrary value.
}

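Finally, a worked example of the allow-list construction performed by `most_frequent_languages` above (again with plain strings instead of charabia types and invented counts): only the languages counted strictly more often than 10% of the script's total survive.

    // Sketch only: keep the languages that clear the 10% threshold.
    fn most_frequent(languages_frequency: &[(&str, usize)]) -> Vec<&str> {
        let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
        let threshold = total / 10;
        languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect()
    }

    fn main() {
        // 40 English words and 2 French words: the threshold is 4, so only English survives
        // and the document is re-tokenized with an English-only allow list.
        assert_eq!(most_frequent(&[("Eng", 40), ("Fra", 2)]), vec!["Eng"]);
    }
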