Compare commits


1 Commit

Author: Tamo · SHA1: 6832bde1f5 · Message: implement a first version of the clear indexes · Date: 2023-05-04 20:25:22 +02:00
31 changed files with 347 additions and 293 deletions

View File

@@ -13,7 +13,7 @@ license.workspace = true
[dependencies]
anyhow = "1.0.70"
csv = "1.2.1"
-milli = { path = "../milli" }
+milli = { path = "../milli", default-features = false }
mimalloc = { version = "0.1.36", default-features = false }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
@@ -31,7 +31,7 @@ flate2 = "1.0.25"
reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"], default-features = false }
[features]
default = ["milli/all-tokenizations"]
default = ["milli/default"]
[[bench]]
name = "search_songs"

View File

@@ -110,6 +110,9 @@ pub enum KindDump {
allow_index_creation: bool,
},
IndexDeletion,
+IndexClear {
+    index_uids: Vec<String>,
+},
IndexCreation {
primary_key: Option<String>,
},
@@ -180,6 +183,7 @@ impl From<KindWithContent> for KindDump {
..
} => KindDump::Settings { settings: new_settings, is_deletion, allow_index_creation },
KindWithContent::IndexDeletion { .. } => KindDump::IndexDeletion,
+KindWithContent::IndexClear { index_uids } => KindDump::IndexClear { index_uids },
KindWithContent::IndexCreation { primary_key, .. } => {
KindDump::IndexCreation { primary_key }
}
@@ -211,8 +215,8 @@ pub(crate) mod test {
use maplit::btreeset;
use meilisearch_types::index_uid_pattern::IndexUidPattern;
use meilisearch_types::keys::{Action, Key};
-use meilisearch_types::milli;
use meilisearch_types::milli::update::Setting;
+use meilisearch_types::milli::{self};
use meilisearch_types::settings::{Checked, Settings};
use meilisearch_types::tasks::{Details, Status};
use serde_json::{json, Map, Value};
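
Note on the hunks above: the new IndexClear task kind carries the list of index uids to clear and is mapped one-to-one into the dump format by the From impl. A minimal sketch of that conversion, assuming the types shown here (the index names are hypothetical):

    // Hedged sketch, not part of the diff: the conversion arm above in action.
    let kind = KindWithContent::IndexClear { index_uids: vec!["movies".into(), "songs".into()] };
    let dump: KindDump = kind.into();
    assert!(matches!(dump, KindDump::IndexClear { index_uids } if index_uids.len() == 2));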

View File

@@ -32,6 +32,7 @@ enum AutobatchKind {
},
IndexCreation,
IndexDeletion,
+IndexClear,
IndexUpdate,
IndexSwap,
}
@@ -74,6 +75,7 @@ impl From<KindWithContent> for AutobatchKind {
}
}
KindWithContent::IndexDeletion { .. } => AutobatchKind::IndexDeletion,
+KindWithContent::IndexClear { .. } => AutobatchKind::IndexClear,
KindWithContent::IndexCreation { .. } => AutobatchKind::IndexCreation,
KindWithContent::IndexUpdate { .. } => AutobatchKind::IndexUpdate,
KindWithContent::IndexSwap { .. } => AutobatchKind::IndexSwap,
@@ -123,6 +125,9 @@ pub enum BatchKind {
IndexDeletion {
ids: Vec<TaskId>,
},
+IndexClear {
+    id: TaskId,
+},
IndexCreation {
id: TaskId,
},
@@ -173,6 +178,7 @@ impl BatchKind {
match AutobatchKind::from(kind) {
K::IndexCreation => (Break(BatchKind::IndexCreation { id: task_id }), true),
K::IndexDeletion => (Break(BatchKind::IndexDeletion { ids: vec![task_id] }), false),
+K::IndexClear => (Break(BatchKind::IndexClear { id: task_id }), false),
K::IndexUpdate => (Break(BatchKind::IndexUpdate { id: task_id }), false),
K::IndexSwap => (Break(BatchKind::IndexSwap { id: task_id }), false),
K::DocumentClear => (Continue(BatchKind::DocumentClear { ids: vec![task_id] }), false),
@@ -222,7 +228,7 @@ impl BatchKind {
match (self, kind) {
// We don't batch any of these operations
-(this, K::IndexCreation | K::IndexUpdate | K::IndexSwap | K::DocumentDeletionByFilter) => Break(this),
+(this, K::IndexCreation | K::IndexUpdate | K::IndexClear | K::IndexSwap | K::DocumentDeletionByFilter) => Break(this),
// We must not batch tasks that don't have the same index creation rights if the index doesn't already exist.
(this, kind) if !index_already_exists && this.allow_index_creation() == Some(false) && kind.allow_index_creation() == Some(true) => {
Break(this)
@@ -480,6 +486,7 @@ impl BatchKind {
(
BatchKind::IndexCreation { .. }
| BatchKind::IndexDeletion { .. }
+| BatchKind::IndexClear { .. }
| BatchKind::IndexUpdate { .. }
| BatchKind::IndexSwap { .. }
| BatchKind::DocumentDeletionByFilter { .. },
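
Taken together, these hunks make IndexClear a batch of its own: the From impl maps it to AutobatchKind::IndexClear, the initial arm wraps it alone in BatchKind::IndexClear, and the accumulation arm returns Break so it is never merged with other tasks. A small illustrative sketch, assuming the enums above:

    // Hedged sketch: an IndexClear task always stands alone.
    let kind = KindWithContent::IndexClear { index_uids: vec!["movies".into()] };
    assert!(matches!(AutobatchKind::from(kind), AutobatchKind::IndexClear));
    // Accumulation then yields Break(BatchKind::IndexClear { id: task_id }),
    // closing the current batch immediately.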

View File

@@ -85,6 +85,10 @@ pub(crate) enum Batch {
tasks: Vec<Task>,
index_has_been_created: bool,
},
+IndexClear {
+    index_uids: Vec<String>,
+    task: Task,
+},
IndexSwap {
task: Task,
},
@@ -154,6 +158,7 @@ impl Batch {
| Batch::TaskDeletion(task)
| Batch::Dump(task)
| Batch::IndexCreation { task, .. }
+| Batch::IndexClear { task, .. }
| Batch::IndexDocumentDeletionByFilter { task, .. }
| Batch::IndexUpdate { task, .. } => vec![task.uid],
Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
@@ -189,6 +194,7 @@ impl Batch {
| TaskDeletion(_)
| SnapshotCreation(_)
| Dump(_)
+| IndexClear { .. }
| IndexSwap { .. } => None,
IndexOperation { op, .. } => Some(op.index_uid()),
IndexCreation { index_uid, .. }
@@ -453,6 +459,14 @@ impl IndexScheduler {
index_has_been_created: must_create_index,
tasks: self.get_existing_tasks(rtxn, ids)?,
})),
+BatchKind::IndexClear { id } => {
+    let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
+    let index_uids = match &task.kind {
+        KindWithContent::IndexClear { index_uids } => index_uids.clone(),
+        _ => unreachable!(),
+    };
+    Ok(Some(Batch::IndexClear { index_uids, task }))
+}
BatchKind::IndexSwap { id } => {
let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
Ok(Some(Batch::IndexSwap { task }))
@@ -1017,6 +1031,13 @@ impl IndexScheduler {
Ok(tasks)
}
+Batch::IndexClear { index_uids, mut task } => {
+    let wtxn = self.env.write_txn()?;
+    self.index_mapper.delete_indexes(wtxn, index_uids, false)?;
+    task.status = Status::Succeeded;
+    Ok(vec![task])
+}
Batch::IndexSwap { mut task } => {
let mut wtxn = self.env.write_txn()?;
let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind {

View File

@@ -173,19 +173,37 @@ impl IndexMapper {
}
}
+pub fn delete_index(&self, wtxn: RwTxn, name: &str) -> Result<()> {
+    self.delete_indexes(wtxn, Some(name), true)
+}
/// Removes the index from the mapping table and the in-memory index map
/// but keeps the associated tasks.
-pub fn delete_index(&self, mut wtxn: RwTxn, name: &str) -> Result<()> {
-    let uuid = self
-        .index_mapping
-        .get(&wtxn, name)?
-        .ok_or_else(|| Error::IndexNotFound(name.to_string()))?;
+pub fn delete_indexes(
+    &self,
+    mut wtxn: RwTxn,
+    names: impl IntoIterator<Item = impl AsRef<str>>,
+    error_on_missing_index: bool,
+) -> Result<()> {
+    let indexes = names
+        .into_iter()
+        .map(|name| {
+            let name = name.as_ref().to_string();
+            let uuid = self
+                .index_mapping
+                .get(&wtxn, &name)?
+                .ok_or_else(|| Error::IndexNotFound(name.to_string()))?;
+            Ok((name, uuid))
+        })
+        .filter(|res| error_on_missing_index || res.is_ok())
+        .collect::<Result<Vec<_>>>()?;
-// Not an error if the index had no stats in cache.
-self.index_stats.delete(&mut wtxn, &uuid)?;
-// Once we retrieved the UUID of the index we remove it from the mapping table.
-assert!(self.index_mapping.delete(&mut wtxn, name)?);
+    for (name, uuid) in indexes.iter() {
+        // Not an error if the index had no stats in cache.
+        self.index_stats.delete(&mut wtxn, uuid)?;
+        // Once we retrieved the UUID of the index we remove it from the mapping table.
+        assert!(self.index_mapping.delete(&mut wtxn, name)?);
+    }
wtxn.commit()?;
@@ -203,51 +221,63 @@ impl IndexMapper {
// This can not be caused by indexation because deleting an index happens in the scheduler itself, so cannot be concurrent with indexation.
//
// In these situations, reporting the error through a panic is in order.
-let closing_event = loop {
-    let mut lock = self.index_map.write().unwrap();
-    match lock.start_deletion(&uuid) {
-        Ok(env_closing) => break env_closing,
-        Err(Some(reopen)) => {
-            // drop the lock here so that we don't synchronously wait for the index to close.
-            drop(lock);
-            tries += 1;
-            if tries >= 100 {
-                panic!("Too many attempts to close index {name} prior to deletion.")
+let indexes = indexes
+    .into_iter()
+    .map(|(name, uuid)| {
+        let closing_event = loop {
+            let mut lock = self.index_map.write().unwrap();
+            match lock.start_deletion(&uuid) {
+                Ok(env_closing) => break env_closing,
+                Err(Some(reopen)) => {
+                    // drop the lock here so that we don't synchronously wait for the index to close.
+                    drop(lock);
+                    tries += 1;
+                    if tries >= 100 {
+                        panic!("Too many attempts to close index {name} prior to deletion.")
+                    }
+                    let reopen =
+                        if let Some(reopen) = reopen.wait_timeout(Duration::from_secs(6)) {
+                            reopen
+                        } else {
+                            continue;
+                        };
+                    reopen.close(&mut self.index_map.write().unwrap());
+                    continue;
+                }
+                // TODO: what is this case, what does that mean?
+                Err(None) => return None,
+            }
-            let reopen = if let Some(reopen) = reopen.wait_timeout(Duration::from_secs(6)) {
-                reopen
-            } else {
-                continue;
-            };
-            reopen.close(&mut self.index_map.write().unwrap());
-            continue;
-        }
-        Err(None) => return Ok(()),
-    }
-};
+        };
+        Some((name, uuid, closing_event))
+    })
+    .filter_map(|thingy| thingy)
+    .map(|(name, uuid, closing)| {
+        (name.to_string(), uuid, self.base_path.join(uuid.to_string()), closing)
+    })
+    .collect::<Vec<_>>();
let index_map = self.index_map.clone();
-let index_path = self.base_path.join(uuid.to_string());
-let index_name = name.to_string();
thread::Builder::new()
.name(String::from("index_deleter"))
.spawn(move || {
-// We first wait to be sure that the previously opened index is effectively closed.
-// This can take a lot of time, this is why we do that in a separate thread.
-if let Some(closing_event) = closing_event {
-    closing_event.wait();
-}
+for (name, uuid, index_path, closing_event) in indexes {
+    // We first wait to be sure that the previously opened index is effectively closed.
+    // This can take a lot of time, this is why we do that in a separate thread.
+    if let Some(closing_event) = closing_event {
+        closing_event.wait();
+    }
-// Then we remove the content from disk.
-if let Err(e) = fs::remove_dir_all(&index_path) {
-    error!(
-        "An error happened when deleting the index {} ({}): {}",
-        index_name, uuid, e
-    );
-}
+    // Then we remove the content from disk.
+    if let Err(e) = fs::remove_dir_all(&index_path) {
+        error!(
+            "An error happened when deleting the index {} ({}): {}",
+            name, uuid, e
+        );
+    }
-// Finally we remove the entry from the index map.
-index_map.write().unwrap().end_deletion(&uuid);
+    // Finally we remove the entry from the index map.
+    index_map.write().unwrap().end_deletion(&uuid);
+}
})
.unwrap();
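
For reference, a pair of hypothetical call sites under the new signatures (env is an assumed LMDB environment handle; each call consumes and commits its own write transaction):

    // Hedged usage sketch, not from the diff:
    index_mapper.delete_index(env.write_txn()?, "movies")?; // errors if "movies" is unknown
    index_mapper.delete_indexes(env.write_txn()?, ["movies", "songs"], false)?; // silently skips unknown names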

View File

@@ -1284,6 +1284,7 @@ impl<'a> Dump<'a> {
KindDump::IndexDeletion => KindWithContent::IndexDeletion {
index_uid: task.index_uid.ok_or(Error::CorruptedDump)?,
},
+KindDump::IndexClear { index_uids } => KindWithContent::IndexClear { index_uids },
KindDump::IndexCreation { primary_key } => KindWithContent::IndexCreation {
index_uid: task.index_uid.ok_or(Error::CorruptedDump)?,
primary_key,

View File

@@ -258,6 +258,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) {
K::TaskCancelation { .. }
| K::TaskDeletion { .. }
| K::DumpCreation { .. }
+| K::IndexClear { .. }
| K::SnapshotCreation => (),
};
if let Some(Details::IndexSwap { swaps }) = &mut task.details {

View File

@@ -22,7 +22,7 @@ file-store = { path = "../file-store" }
flate2 = "1.0.25"
fst = "0.4.7"
memmap2 = "0.5.10"
-milli = { path = "../milli" }
+milli = { path = "../milli", default-features = false }
roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde-cs = "0.2.4"
@@ -40,7 +40,7 @@ meili-snap = { path = "../meili-snap" }
[features]
# all specialized tokenizations
all-tokenizations = ["milli/all-tokenizations"]
default = ["milli/default"]
# chinese specialized tokenization
chinese = ["milli/chinese"]

View File

@@ -46,6 +46,7 @@ impl Task {
| SnapshotCreation
| TaskCancelation { .. }
| TaskDeletion { .. }
+| IndexClear { .. }
| IndexSwap { .. } => None,
DocumentAdditionOrUpdate { index_uid, .. }
| DocumentDeletion { index_uid, .. }
@@ -72,6 +73,7 @@ impl Task {
| KindWithContent::DocumentClear { .. }
| KindWithContent::SettingsUpdate { .. }
| KindWithContent::IndexDeletion { .. }
+| KindWithContent::IndexClear { .. }
| KindWithContent::IndexCreation { .. }
| KindWithContent::IndexUpdate { .. }
| KindWithContent::IndexSwap { .. }
@@ -111,6 +113,9 @@ pub enum KindWithContent {
is_deletion: bool,
allow_index_creation: bool,
},
+IndexClear {
+    index_uids: Vec<String>,
+},
IndexDeletion {
index_uid: String,
},
@@ -156,6 +161,7 @@ impl KindWithContent {
KindWithContent::SettingsUpdate { .. } => Kind::SettingsUpdate,
KindWithContent::IndexCreation { .. } => Kind::IndexCreation,
KindWithContent::IndexDeletion { .. } => Kind::IndexDeletion,
+KindWithContent::IndexClear { .. } => Kind::IndexDeletion,
KindWithContent::IndexUpdate { .. } => Kind::IndexUpdate,
KindWithContent::IndexSwap { .. } => Kind::IndexSwap,
KindWithContent::TaskCancelation { .. } => Kind::TaskCancelation,
@@ -181,6 +187,7 @@ impl KindWithContent {
| IndexCreation { index_uid, .. }
| IndexUpdate { index_uid, .. }
| IndexDeletion { index_uid } => vec![index_uid],
+IndexClear { index_uids } => index_uids.into_iter().map(|s| s.as_ref()).collect(),
IndexSwap { swaps } => {
let mut indexes = HashSet::<&str>::default();
for swap in swaps {
@@ -214,7 +221,9 @@ impl KindWithContent {
deleted_documents: None,
})
}
-KindWithContent::DocumentClear { .. } | KindWithContent::IndexDeletion { .. } => {
+KindWithContent::DocumentClear { .. }
+| KindWithContent::IndexDeletion { .. }
+| KindWithContent::IndexClear { .. } => {
Some(Details::ClearAll { deleted_documents: None })
}
KindWithContent::SettingsUpdate { new_settings, .. } => {
@@ -268,7 +277,7 @@ impl KindWithContent {
KindWithContent::SettingsUpdate { new_settings, .. } => {
Some(Details::SettingsUpdate { settings: new_settings.clone() })
}
-KindWithContent::IndexDeletion { .. } => None,
+KindWithContent::IndexDeletion { .. } | KindWithContent::IndexClear { .. } => None,
KindWithContent::IndexCreation { primary_key, .. }
| KindWithContent::IndexUpdate { primary_key, .. } => {
Some(Details::IndexInfo { primary_key: primary_key.clone() })
@@ -307,7 +316,7 @@ impl From<&KindWithContent> for Option<Details> {
KindWithContent::SettingsUpdate { new_settings, .. } => {
Some(Details::SettingsUpdate { settings: new_settings.clone() })
}
-KindWithContent::IndexDeletion { .. } => None,
+KindWithContent::IndexDeletion { .. } | KindWithContent::IndexClear { .. } => None,
KindWithContent::IndexCreation { primary_key, .. } => {
Some(Details::IndexInfo { primary_key: primary_key.clone() })
}

View File

@@ -106,7 +106,7 @@ vergen = { version = "7.5.1", default-features = false, features = ["git"] }
zip = { version = "0.6.4", optional = true }
[features]
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
default = ["analytics", "meilisearch-types/default", "mini-dashboard"]
analytics = ["segment"]
mini-dashboard = ["actix-web-static-files", "static-files", "anyhow", "cargo_toml", "hex", "reqwest", "sha-1", "tempfile", "zip"]
chinese = ["meilisearch-types/chinese"]

View File

@@ -31,7 +31,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(
web::resource("")
.route(web::get().to(list_indexes))
-.route(web::post().to(SeqHandler(create_index))),
+.route(web::post().to(SeqHandler(create_index)))
+.route(web::delete().to(SeqHandler(delete_all_indexes))),
)
.service(
web::scope("/{index_uid}")
@@ -107,6 +108,22 @@ pub async fn list_indexes(
Ok(HttpResponse::Ok().json(ret))
}
+pub async fn delete_all_indexes(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_DELETE }>, Data<IndexScheduler>>,
+    _req: HttpRequest,
+    _analytics: web::Data<dyn Analytics>,
+) -> Result<HttpResponse, ResponseError> {
+    let filters = index_scheduler.filters();
+    let indexes = index_scheduler.index_names()?;
+    let indexes = indexes.into_iter().filter(|uid| filters.is_index_authorized(uid)).collect();
+    let task = KindWithContent::IndexClear { index_uids: indexes };
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
+    Ok(HttpResponse::Accepted().json(task))
+}
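
The new route deletes nothing inline: it enqueues a single IndexClear task covering every index the API key is authorized on and replies 202 Accepted with the usual summarized task view. Roughly, for a key authorized on two hypothetical indexes:

    // Hedged sketch of the task registered by DELETE /indexes:
    let task = KindWithContent::IndexClear {
        index_uids: vec!["movies".to_string(), "songs".to_string()],
    };
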
#[derive(Deserr, Debug)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct IndexCreateRequest {

View File

@@ -69,7 +69,7 @@ rand = {version = "0.8.5", features = ["small_rng"] }
fuzzcheck = "0.12.1"
[features]
all-tokenizations = [ "charabia/default" ]
default = [ "charabia/default" ]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml

View File

@@ -2,7 +2,7 @@ use std::cmp;
use crate::{relative_from_absolute_position, Position};
-pub const MAX_DISTANCE: u32 = 4;
+pub const MAX_DISTANCE: u32 = 8;
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs <= rhs {

View File

@@ -201,14 +201,12 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
#[cfg(test)]
mod test {
#[allow(unused_imports)]
use super::*;
use crate::index::tests::TempIndex;
#[cfg(feature = "japanese")]
#[cfg(feature = "default")]
#[test]
fn test_kanji_language_detection() {
use crate::index::tests::TempIndex;
let index = TempIndex::new();
index

View File

@@ -46,7 +46,7 @@ use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::ranking_rule_graph::{
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
-    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
+    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph,
};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@@ -54,12 +54,6 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::PathVisitor;
use crate::{Result, TermsMatchingStrategy};
-pub type Words = GraphBasedRankingRule<WordsGraph>;
-impl GraphBasedRankingRule<WordsGraph> {
-    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
-        Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
-    }
-}
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
impl GraphBasedRankingRule<ProximityGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {

View File

@@ -13,7 +13,6 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::{
Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
-WordsCondition, WordsGraph,
};
use crate::search::new::ranking_rules::BoxRankingRule;
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
@@ -25,12 +24,11 @@ pub enum SearchEvents {
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> },
+WordsGraph { query_graph: QueryGraph },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
TypoGraph { graph: RankingRuleGraph<TypoGraph> },
TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
-WordsGraph { graph: RankingRuleGraph<WordsGraph> },
-WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
FidGraph { graph: RankingRuleGraph<FidGraph> },
FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
PositionGraph { graph: RankingRuleGraph<PositionGraph> },
@@ -141,11 +139,8 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
let Some(location) = self.location.last() else { return };
match location {
Location::Words => {
-if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
-    self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
-}
-if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
-    self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
-}
+if let Some(query_graph) = state.downcast_ref::<QueryGraph>() {
+    self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() });
}
}
Location::Typo => {
@@ -334,6 +329,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
SearchEvents::ExtendResults { new } => {
self.write_extend_results(new)?;
}
+SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?,
SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::ProximityPaths { paths } => {
self.write_rr_graph_paths::<ProximityGraph>(paths)?;
@@ -342,10 +338,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
SearchEvents::TypoPaths { paths } => {
self.write_rr_graph_paths::<TypoGraph>(paths)?;
}
-SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
-SearchEvents::WordsPaths { paths } => {
-    self.write_rr_graph_paths::<WordsGraph>(paths)?;
-}
SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::FidPaths { paths } => {
self.write_rr_graph_paths::<FidGraph>(paths)?;
@@ -490,13 +482,13 @@ fill: \"#B6E2D3\"
}
Ok(())
}
-// fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
-//     self.make_new_file_for_internal_state_if_needed()?;
+fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
+    self.make_new_file_for_internal_state_if_needed()?;
-//     self.write_query_graph(&qg)?;
+    self.write_query_graph(&qg)?;
-//     Ok(())
-// }
+    Ok(())
+}
fn write_rr_graph<R: RankingRuleGraphTrait>(
&mut self,
graph: &RankingRuleGraph<R>,

View File

@@ -52,7 +52,7 @@ impl MatchingWords {
words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
-is_prefix: term.is_prefix(),
+is_prefix: term.is_cached_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
@@ -244,8 +244,6 @@ pub(crate) mod tests {
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
]))
.unwrap();
temp_index
@@ -307,7 +305,7 @@ pub(crate) mod tests {
..Default::default()
})
.next(),
-Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+None
);
assert_eq!(
matching_words

View File

@@ -499,36 +499,17 @@ mod tests {
use charabia::TokenizerBuilder;
use matching_words::tests::temp_index_with_documents;
+use super::super::located_query_terms_from_tokens;
use super::*;
-use crate::index::tests::TempIndex;
-use crate::{execute_search, SearchContext};
+use crate::SearchContext;
impl<'a> MatcherBuilder<'a, &[u8]> {
-    fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
-        let mut ctx = SearchContext::new(index, rtxn);
-        let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
-            &mut ctx,
-            &Some(query.to_string()),
-            crate::TermsMatchingStrategy::default(),
-            false,
-            &None,
-            &None,
-            crate::search::new::GeoSortStrategy::default(),
-            0,
-            100,
-            Some(10),
-            &mut crate::DefaultSearchLogger,
-            &mut crate::DefaultSearchLogger,
-        )
-        .unwrap();
-        // consume context and located_query_terms to build MatchingWords.
-        let matching_words = match located_query_terms {
-            Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
-            None => MatchingWords::default(),
-        };
-        MatcherBuilder::new(matching_words, TokenizerBuilder::new().build())
+    pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self {
+        let tokenizer = TokenizerBuilder::new().build();
+        let tokens = tokenizer.tokenize(query);
+        let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
+        let matching_words = MatchingWords::new(ctx, query_terms);
+        Self::new(matching_words, TokenizerBuilder::new().build())
    }
}
@@ -536,7 +517,8 @@ mod tests {
fn format_identity() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: false, crop: None };
@@ -563,7 +545,8 @@ mod tests {
fn format_highlight() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: true, crop: None };
@@ -606,7 +589,8 @@ mod tests {
fn highlight_unicode() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "world");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing prefix match.
@@ -615,7 +599,7 @@ mod tests {
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>ôle"
@"<em>Ŵôřlḑôle</em>"
);
// Text containing unicode match.
@@ -627,7 +611,8 @@ mod tests {
@"<em>Ŵôřlḑ</em>"
);
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "westfali");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "westfali");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing unicode match.
@@ -636,7 +621,7 @@ mod tests {
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Westfáli</em>a"
@"<em>Westfália</em>"
);
}
@@ -644,7 +629,8 @@ mod tests {
fn format_crop() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: false, crop: Some(10) };
@@ -741,7 +727,8 @@ mod tests {
fn format_highlight_crop() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: true, crop: Some(10) };
@@ -803,7 +790,8 @@ mod tests {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let builder = MatcherBuilder::new_test(ctx, "split the world");
let text = "void void split the world void void.";
@@ -839,8 +827,8 @@ mod tests {
fn partial_matches() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
-let mut builder =
-    MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
+let ctx = SearchContext::new(&temp_index, &rtxn);
+let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\"");
builder.highlight_prefix("_".to_string());
builder.highlight_suffix("_".to_string());

View File

@@ -15,7 +15,11 @@ mod resolve_query_graph;
mod small_bitmap;
mod exact_attribute;
+// TODO: documentation + comments
+// implementation is currently an adaptation of the previous implementation to fit with the new model
mod sort;
+// TODO: documentation + comments
+mod words;
#[cfg(test)]
mod tests;
@@ -39,10 +43,10 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
+use words::Words;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
-use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use crate::search::new::distinct::apply_distinct_rule;
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
@@ -198,11 +202,6 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
let mut sorted_fields = HashSet::new();
let mut geo_sorted = false;
-// Don't add the `words` ranking rule if the term matching strategy is `All`
-if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
-    words = true;
-}
let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
@@ -398,8 +397,8 @@ pub fn execute_search(
None
};
let bucket_sort_output = if let Some(query_terms) = query_terms {
-let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
-located_query_terms = Some(new_located_query_terms);
+let graph = QueryGraph::from_query(ctx, &query_terms)?;
+located_query_terms = Some(query_terms);
let ranking_rules = get_ranking_rules_for_query_graph_search(
ctx,

View File

@@ -88,15 +88,12 @@ pub struct QueryGraph {
}
impl QueryGraph {
-/// Build the query graph from the parsed user search query, return an updated list of the located query terms
-/// which contains ngrams.
+/// Build the query graph from the parsed user search query.
pub fn from_query(
ctx: &mut SearchContext,
// NOTE: the terms here must be consecutive
terms: &[LocatedQueryTerm],
-) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
-    let mut new_located_query_terms = terms.to_vec();
+) -> Result<QueryGraph> {
let nbr_typos = number_of_typos_allowed(ctx)?;
let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
@@ -110,11 +107,10 @@ impl QueryGraph {
let original_terms_len = terms.len();
for term_idx in 0..original_terms_len {
let mut new_nodes = vec![];
let new_node_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
-term_subset: QueryTermSubset::full(terms[term_idx].value),
+term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)),
positions: terms[term_idx].positions.clone(),
term_ids: term_idx as u8..=term_idx as u8,
}),
@@ -125,7 +121,6 @@ impl QueryGraph {
if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
{
-new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
@@ -141,7 +136,6 @@ impl QueryGraph {
if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
{
-new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
@@ -173,7 +167,7 @@ impl QueryGraph {
let mut graph = QueryGraph { root_node, end_node, nodes };
graph.build_initial_edges();
-Ok((graph, new_located_query_terms))
+Ok(graph)
}
/// Remove the given nodes, connecting all their predecessors to all their successors.

View File

@@ -470,9 +470,6 @@ impl QueryTerm {
pub fn is_cached_prefix(&self) -> bool {
self.zero_typo.use_prefix_db.is_some()
}
-pub fn is_prefix(&self) -> bool {
-    self.is_prefix
-}
pub fn original_word(&self, ctx: &SearchContext) -> String {
ctx.word_interner.get(self.original).clone()
}

View File

@@ -205,12 +205,18 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
+let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
-self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
-    if cur_node == self.query_graph.end_node {
-        *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
-        return true;
-    }
+let mut node_stack = VecDeque::new();
+*costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
+for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() {
+    node_stack.push_back(prev_node);
+    enqueued.insert(prev_node);
+}
+while let Some(cur_node) = node_stack.pop_front() {
let mut self_costs = Vec::<u64>::new();
let cur_node_edges = &self.edges_of_node.get(cur_node);
@@ -226,8 +232,13 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
self_costs.dedup();
*costs_to_end.get_mut(cur_node) = self_costs;
-    true
-});
+    for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
+        if !enqueued.contains(prev_node) {
+            node_stack.push_back(prev_node);
+            enqueued.insert(prev_node);
+        }
+    }
+}
costs_to_end
}
@@ -236,9 +247,17 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
node_with_removed_outgoing_conditions: Interned<QueryNode>,
costs: &mut MappedInterner<QueryNode, Vec<u64>>,
) {
-self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
+let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
+let mut node_stack = VecDeque::new();
+enqueued.insert(node_with_removed_outgoing_conditions);
+node_stack.push_back(node_with_removed_outgoing_conditions);
+'main_loop: while let Some(cur_node) = node_stack.pop_front() {
    let mut costs_to_remove = FxHashSet::default();
-    costs_to_remove.extend(costs.get(cur_node).iter().copied());
+    for c in costs.get(cur_node) {
+        costs_to_remove.insert(*c);
+    }
let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() {
@@ -246,79 +265,23 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
for cost in costs.get(edge.dest_node).iter() {
costs_to_remove.remove(&(*cost + edge.cost as u64));
if costs_to_remove.is_empty() {
-return false;
+continue 'main_loop;
}
}
}
if costs_to_remove.is_empty() {
-return false;
+continue 'main_loop;
}
let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
for c in costs_to_remove {
new_costs.remove(&c);
}
*costs.get_mut(cur_node) = new_costs.into_iter().collect();
-    true
-});
}
-/// Traverse the graph backwards from the given node such that every time
-/// a node is visited, we are guaranteed that all its successors either:
-/// 1. have already been visited; OR
-/// 2. were not reachable from the given node
-pub fn traverse_breadth_first_backward(
-    &self,
-    from: Interned<QueryNode>,
-    mut visit: impl FnMut(Interned<QueryNode>) -> bool,
-) {
-    let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
-    {
-        // go backward to get the set of all reachable nodes from the given node
-        // the nodes that are not reachable will be set as `visited`
-        let mut stack = VecDeque::new();
-        let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
-        enqueued.insert(from);
-        stack.push_back(from);
-        while let Some(n) = stack.pop_front() {
-            if reachable.contains(n) {
-                continue;
-            }
-            reachable.insert(n);
-            for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
-                if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
-                    stack.push_back(prev_node);
-                    enqueued.insert(prev_node);
-                }
-            }
-        }
-    };
-    let mut unreachable_or_visited =
-        SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
-    for (n, _) in self.query_graph.nodes.iter() {
-        if !reachable.contains(n) {
-            unreachable_or_visited.insert(n);
-        }
-    }
-    let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
-    let mut stack = VecDeque::new();
-    enqueued.insert(from);
-    stack.push_back(from);
-    while let Some(cur_node) = stack.pop_front() {
-        if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
-            stack.push_back(cur_node);
-            continue;
-        }
-        unreachable_or_visited.insert(cur_node);
-        if visit(cur_node) {
-            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
-                if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node)
-                {
-                    stack.push_back(prev_node);
-                    enqueued.insert(prev_node);
-                }
+for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
+    if !enqueued.contains(prev_node) {
+        node_stack.push_back(prev_node);
+        enqueued.insert(prev_node);
+    }
+}
}
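
The rewrite inlines what traverse_breadth_first_backward used to do: both call sites now run a plain backward BFS with an enqueued set. Stripped of the interner types, the pattern is the following (a standalone sketch with simplified node indices, not the diff's code):

    use std::collections::VecDeque;

    // Backward BFS over predecessor lists; `enqueued` guarantees each node
    // is pushed at most once.
    fn backward_bfs(predecessors: &[Vec<usize>], start: usize, mut visit: impl FnMut(usize)) {
        let mut enqueued = vec![false; predecessors.len()];
        let mut queue = VecDeque::new();
        enqueued[start] = true;
        queue.push_back(start);
        while let Some(node) = queue.pop_front() {
            visit(node);
            for &prev in &predecessors[node] {
                if !enqueued[prev] {
                    enqueued[prev] = true;
                    queue.push_back(prev);
                }
            }
        }
    }

Unlike the deleted helper, this ordering no longer guarantees that all of a node's reachable successors are visited before the node itself; the cost updates above appear to tolerate that.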

View File

@@ -20,8 +20,6 @@ mod position;
mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
/// Implementation of the `words` ranking rule
mod words;
use std::collections::BTreeSet;
use std::hash::Hash;
@@ -35,7 +33,6 @@ pub use position::{PositionCondition, PositionGraph};
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph};
pub use words::{WordsCondition, WordsGraph};
use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
use super::query_term::LocatedQueryTermSubset;

View File

@@ -111,16 +111,23 @@ impl RankingRuleGraphTrait for PositionGraph {
fn cost_from_position(sum_positions: u32) -> u32 {
match sum_positions {
-0 => 0,
-1 => 1,
-2..=4 => 2,
-5..=7 => 3,
-8..=11 => 4,
-12..=16 => 5,
-17..=24 => 6,
-25..=64 => 7,
-65..=256 => 8,
-257..=1024 => 9,
-_ => 10,
+0 | 1 | 2 | 3 => sum_positions,
+4 | 5 => 4,
+6 | 7 => 5,
+8 | 9 => 6,
+10 | 11 => 7,
+12 | 13 => 8,
+14 | 15 => 9,
+16 | 17..=24 => 10,
+25..=32 => 11,
+33..=64 => 12,
+65..=128 => 13,
+129..=256 => 14,
+257..=512 => 15,
+513..=1024 => 16,
+1025..=2048 => 17,
+2049..=4096 => 18,
+4097..=8192 => 19,
+_ => 20,
}
}
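
The new table keeps fine-grained costs for small position sums and collapses larger sums into roughly logarithmic buckets, raising the cap from 10 to 20. A few sample values under the new arms, for illustration:

    assert_eq!(cost_from_position(3), 3);     // 0..=3 map to themselves
    assert_eq!(cost_from_position(20), 10);   // 16 and 17..=24 collapse to 10
    assert_eq!(cost_from_position(5000), 19); // 4097..=8192 collapse to 19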

View File

@@ -6,8 +6,6 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
-const MAX_PROX: usize = crate::proximity::MAX_DISTANCE as usize;
pub fn build_edges(
_ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<ProximityCondition>,
@@ -37,7 +35,7 @@ pub fn build_edges(
}
let mut conditions = vec![];
-for cost in right_ngram_length..(MAX_PROX + right_ngram_length) {
+for cost in right_ngram_length..(7 + right_ngram_length) {
conditions.push((
cost as u32,
conditions_interner.insert(ProximityCondition::Uninit {
@@ -49,7 +47,7 @@ pub fn build_edges(
}
conditions.push((
-(MAX_PROX + right_ngram_length) as u32,
+(7 + right_ngram_length) as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
));
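
The hard-coded 7 appears to be MAX_DISTANCE - 1 under the new value of 8 from the proximity hunk above: the loop emits one condition per concrete proximity cost, plus a final unconditioned one. Illustration for a unigram right-hand term (assumed right_ngram_length = 1):

    let right_ngram_length = 1usize;
    let costs: Vec<usize> = (right_ngram_length..(7 + right_ngram_length)).collect();
    assert_eq!(costs, vec![1, 2, 3, 4, 5, 6, 7]); // plus one final condition at cost 8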

View File

@@ -1,49 +0,0 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct WordsCondition {
term: LocatedQueryTermSubset,
}
pub enum WordsGraph {}
impl RankingRuleGraphTrait for WordsGraph {
type Condition = WordsCondition;
fn resolve_condition(
ctx: &mut SearchContext,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let WordsCondition { term, .. } = condition;
// maybe compute_query_term_subset_docids should accept a universe as argument
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
docids &= universe;
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
fn build_edges(
_ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
Ok(vec![(
to_term.term_ids.len() as u32,
conditions_interner.insert(WordsCondition { term: to_term.clone() }),
)])
}
}

View File

@@ -11,10 +11,11 @@ It doesn't test properly:
- distinct attributes with arrays (because we know it's incorrect as well)
*/
+use std::collections::HashSet;
use big_s::S;
use heed::RoTxn;
use maplit::hashset;
-use std::collections::HashSet;
use super::collect_field_values;
use crate::index::tests::TempIndex;

View File

@@ -4,7 +4,7 @@ pub mod distinct;
pub mod exactness;
pub mod geo_sort;
pub mod integration;
#[cfg(feature = "all-tokenizations")]
#[cfg(feature = "default")]
pub mod language;
pub mod ngram_split_words;
pub mod proximity;

View File

@@ -0,0 +1,87 @@
use roaring::RoaringBitmap;
use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::resolve_query_graph::compute_query_graph_docids;
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::{Result, TermsMatchingStrategy};
pub struct Words {
exhausted: bool, // TODO: remove
query_graph: Option<QueryGraph>,
nodes_to_remove: Vec<SmallBitmap<QueryNode>>,
terms_matching_strategy: TermsMatchingStrategy,
}
impl Words {
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
Self {
exhausted: true,
query_graph: None,
nodes_to_remove: vec![],
terms_matching_strategy,
}
}
}
impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
fn id(&self) -> String {
"words".to_owned()
}
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_universe: &RoaringBitmap,
parent_query_graph: &QueryGraph,
) -> Result<()> {
self.exhausted = false;
self.query_graph = Some(parent_query_graph.clone());
self.nodes_to_remove = match self.terms_matching_strategy {
TermsMatchingStrategy::Last => {
let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx);
ns.reverse();
ns
}
TermsMatchingStrategy::All => {
vec![]
}
};
Ok(())
}
fn next_bucket(
&mut self,
ctx: &mut SearchContext<'ctx>,
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
if self.exhausted {
return Ok(None);
}
let Some(query_graph) = &mut self.query_graph else { panic!() };
logger.log_internal_state(query_graph);
let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?;
let child_query_graph = query_graph.clone();
if self.nodes_to_remove.is_empty() {
self.exhausted = true;
} else {
let nodes_to_remove = self.nodes_to_remove.pop().unwrap();
query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>());
}
Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
}
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.exhausted = true;
self.nodes_to_remove = vec![];
self.query_graph = None;
}
}
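
This restores a bucket-based words rule: every bucket resolves the current query graph, then pops one precomputed set of nodes to remove, so successive buckets relax the query from all terms toward fewer terms (a single bucket is produced under TermsMatchingStrategy::All). A hedged sketch of how it is wired up, per the mod.rs hunks earlier:

    // Hypothetical instantiation; Words::new and id() are shown above.
    let words_rule = Words::new(TermsMatchingStrategy::Last);
    assert_eq!(words_rule.id(), "words");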

View File

@@ -91,7 +91,7 @@ fn document_word_positions_into_sorter(
while !word_positions_heap.is_empty() {
while let Some(peeked_word_position) = word_positions_heap.pop() {
ordered_peeked_word_positions.push(peeked_word_position);
-if ordered_peeked_word_positions.len() == (MAX_DISTANCE - 1) as usize {
+if ordered_peeked_word_positions.len() == 7 {
break;
}
}

View File

@@ -1581,7 +1581,7 @@ mod tests {
assert_eq!(count, 4);
}
#[cfg(feature = "chinese")]
#[cfg(feature = "default")]
#[test]
fn test_meilisearch_1714() {
let index = TempIndex::new();