Compare commits


1 Commit

Author: JannisK89
SHA1: 7b7a27a3b2
Message: removed unnecessary borrow call
Date: 2023-08-10 12:02:07 +02:00
70 changed files with 780 additions and 3237 deletions

View File

@@ -53,6 +53,5 @@ jobs:
         uses: mislav/bump-homebrew-formula-action@v2
         with:
           formula-name: meilisearch
-          formula-path: Formula/m/meilisearch.rb
         env:
           COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}

View File

@@ -161,7 +161,7 @@ jobs:
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: 1.71.1
+          toolchain: 1.69.0
           override: true
           components: clippy
       - name: Cache dependencies

Cargo.lock (generated)

File diff suppressed because it is too large.

View File

@@ -18,7 +18,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.4.2"
+version = "1.3.0"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"

View File

@@ -262,9 +262,6 @@ pub(crate) mod test {
             sortable_attributes: Setting::Set(btreeset! { S("age") }),
             ranking_rules: Setting::NotSet,
             stop_words: Setting::NotSet,
-            non_separator_tokens: Setting::NotSet,
-            separator_tokens: Setting::NotSet,
-            dictionary: Setting::NotSet,
             synonyms: Setting::NotSet,
             distinct_attribute: Setting::NotSet,
             typo_tolerance: Setting::NotSet,

View File

@@ -340,9 +340,6 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
                 }
             },
             stop_words: settings.stop_words.into(),
-            non_separator_tokens: v6::Setting::NotSet,
-            separator_tokens: v6::Setting::NotSet,
-            dictionary: v6::Setting::NotSet,
             synonyms: settings.synonyms.into(),
             distinct_attribute: settings.distinct_attribute.into(),
             typo_tolerance: match settings.typo_tolerance {

View File

@@ -1,24 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: spells.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}

View File

@@ -1,38 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: products.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {
"android": [
"phone",
"smartphone"
],
"iphone": [
"phone",
"smartphone"
],
"phone": [
"android",
"iphone",
"smartphone"
]
},
"distinctAttribute": null
}

View File

@@ -1,31 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: movies.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [
"genres",
"id"
],
"sortableAttributes": [
"genres",
"id"
],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness",
"release_date:asc"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}

View File

@@ -14,7 +14,6 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
-unescaper = "0.1.2"
 
 [dev-dependencies]
 insta = "1.29.0"

View File

@@ -62,7 +62,6 @@ pub enum ErrorKind<'a> {
     MisusedGeoRadius,
     MisusedGeoBoundingBox,
     InvalidPrimary,
-    InvalidEscapedNumber,
     ExpectedEof,
     ExpectedValue(ExpectedValueKind),
     MalformedValue,
@@ -148,9 +147,6 @@ impl<'a> Display for Error<'a> {
                 let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                 writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
             }
-            ErrorKind::InvalidEscapedNumber => {
-                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
-            }
             ErrorKind::ExpectedEof => {
                 writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
             }

View File

@@ -545,8 +545,6 @@ impl<'a> std::fmt::Display for Token<'a> {
 #[cfg(test)]
 pub mod tests {
-    use FilterCondition as Fc;
-
     use super::*;
 
     /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -558,22 +556,14 @@ pub mod tests {
         unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
     }
 
+    #[test]
+    fn parse() {
+        use FilterCondition as Fc;
+
     fn p(s: &str) -> impl std::fmt::Display + '_ {
         Fc::parse(s).unwrap().unwrap()
     }
 
-    #[test]
-    fn parse_escaped() {
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
-        // but it also works with other sequencies
-        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
-    }
-
-    #[test]
-    fn parse() {
         // Test equal
         insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
         insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");

View File

@@ -171,24 +171,7 @@ pub fn parse_value(input: Span) -> IResult<Token> {
         })
     })?;
 
-    match unescaper::unescape(value.value()) {
-        Ok(content) => {
-            if content.len() != value.value().len() {
-                Ok((input, Token::new(value.original_span(), Some(content))))
-            } else {
-                Ok((input, value))
-            }
-        }
-        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
-        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
-            value.original_span(),
-            ErrorKind::InvalidEscapedNumber,
-        ))),
-        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
-            value.original_span(),
-            ErrorKind::MalformedValue,
-        ))),
-    }
+    Ok((input, value))
 }
 
 fn is_value_component(c: char) -> bool {
@@ -335,17 +318,17 @@ pub mod test {
             ("\"cha'nnel\"", "cha'nnel", false),
             ("I'm tamo", "I", false),
             // escaped thing but not quote
-            (r#""\\""#, r#"\"#, true),
-            (r#""\\\\\\""#, r#"\\\"#, true),
-            (r#""aa\\aa""#, r#"aa\aa"#, true),
+            (r#""\\""#, r#"\\"#, false),
+            (r#""\\\\\\""#, r#"\\\\\\"#, false),
+            (r#""aa\\aa""#, r#"aa\\aa"#, false),
             // with double quote
             (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
             (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
             (r#""\"\"""#, r#""""#, true),
             // with simple quote
             (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
             (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
             (r#"'\'\''"#, r#"''"#, true),
         ];
@@ -367,14 +350,7 @@ pub mod test {
                 "Filter `{}` was not supposed to be escaped",
                 input
             );
-            assert_eq!(
-                token.value(),
-                expected,
-                "Filter `{}` failed by giving `{}` instead of `{}`.",
-                input,
-                token.value(),
-                expected
-            );
+            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
         }
     }
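A quick way to read the escaping table above is to drive the parser directly. A minimal sketch, assuming the workspace's filter-parser crate as a dependency; its `FilterCondition::parse` returns a Result wrapping an Option (hence the two unwraps), and the expected display comes from a test row that is unchanged on both sides of this revert:

use filter_parser::FilterCondition;

fn main() {
    // Quote escaping survives on both sides of this revert: `\'` inside a
    // single-quoted value collapses to `'`. What the revert removes is the
    // general unescaper pass (`\\`, `\x20`, `\n`, ...) of the left-hand side.
    let condition = FilterCondition::parse(r#"title = 'I\'m "super" tamo'"#)
        .unwrap() // no syntax error
        .unwrap(); // the filter is not empty
    assert_eq!(condition.to_string(), r#"{title} = {I'm "super" tamo}"#);
}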

View File

@@ -13,7 +13,7 @@ license.workspace = true
 [dependencies]
 arbitrary = { version = "1.3.0", features = ["derive"] }
 clap = { version = "4.3.0", features = ["derive"] }
-fastrand = "2.0.0"
+fastrand = "1.9.0"
 milli = { path = "../milli" }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }

View File

@@ -67,6 +67,10 @@ pub(crate) enum Batch {
         op: IndexOperation,
         must_create_index: bool,
     },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
     IndexCreation {
         index_uid: String,
         primary_key: Option<String>,
@@ -110,10 +114,6 @@ pub(crate) enum IndexOperation {
         documents: Vec<Vec<String>>,
         tasks: Vec<Task>,
     },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
     DocumentClear {
         index_uid: String,
         tasks: Vec<Task>,
@@ -155,6 +155,7 @@ impl Batch {
             | Batch::TaskDeletion(task)
             | Batch::Dump(task)
             | Batch::IndexCreation { task, .. }
+            | Batch::IndexDocumentDeletionByFilter { task, .. }
             | Batch::IndexUpdate { task, .. } => vec![task.uid],
             Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                 tasks.iter().map(|task| task.uid).collect()
@@ -166,7 +167,6 @@ impl Batch {
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
                 }
-                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                 IndexOperation::SettingsAndDocumentOperation {
                     document_import_tasks: tasks,
                     settings_tasks: other,
@@ -194,7 +194,8 @@ impl Batch {
             IndexOperation { op, .. } => Some(op.index_uid()),
             IndexCreation { index_uid, .. }
             | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. }
+            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
         }
     }
 }
@@ -204,7 +205,6 @@ impl IndexOperation {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
             | IndexOperation::DocumentDeletion { index_uid, .. }
-            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
             | IndexOperation::DocumentClear { index_uid, .. }
             | IndexOperation::Settings { index_uid, .. }
             | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,12 +239,9 @@ impl IndexScheduler {
                 let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                 match &task.kind {
                     KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexOperation {
-                            op: IndexOperation::IndexDocumentDeletionByFilter {
-                                index_uid: index_uid.clone(),
-                                task,
-                            },
-                            must_create_index: false,
-                        }))
+                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
+                            index_uid: index_uid.clone(),
+                            task,
+                        }))
                     }
                     _ => unreachable!(),
@@ -899,6 +896,51 @@ impl IndexScheduler {
                 Ok(tasks)
             }
+            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let (index_uid, filter) =
+                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
+                        &task.kind
+                    {
+                        (index_uid, filter_expr)
+                    } else {
+                        unreachable!()
+                    };
+                let index = {
+                    let rtxn = self.env.read_txn()?;
+                    self.index_mapper.index(&rtxn, index_uid)?
+                };
+                let deleted_documents = delete_document_by_filter(filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
             Batch::IndexCreation { index_uid, primary_key, task } => {
                 let wtxn = self.env.write_txn()?;
                 if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1257,47 +1299,6 @@ impl IndexScheduler {
                 Ok(tasks)
             }
-            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let filter =
-                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
-                        &task.kind
-                    {
-                        filter_expr
-                    } else {
-                        unreachable!()
-                    };
-                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
             IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                 let indexer_config = self.index_mapper.indexer_config();
                 let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1497,22 +1498,23 @@
     }
 }
 
-fn delete_document_by_filter<'a>(
-    wtxn: &mut RwTxn<'a, '_>,
-    filter: &serde_json::Value,
-    index: &'a Index,
-) -> Result<u64> {
+fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
     Ok(if let Some(filter) = filter {
-        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
+        let mut wtxn = index.write_txn()?;
+        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
             milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                 Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
             }
             e => e.into(),
         })?;
-        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
+        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
         delete_operation.delete_documents(&candidates);
-        delete_operation.execute().map(|result| result.deleted_documents)?
+        let deleted_documents =
+            delete_operation.execute().map(|result| result.deleted_documents)?;
+        wtxn.commit()?;
+        deleted_documents
     } else {
        0
     })
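This last hunk is the substantive part of the file: on the removed side, delete_document_by_filter borrowed the batch's write transaction (`&mut RwTxn`), so the deletion committed together with the rest of the batch; the restored version takes an owned Index, opens its own transaction, and commits before returning. A minimal sketch of the two ownership styles, using hypothetical stand-in types rather than the real milli/heed API:

struct WriteTxn;
struct Index;

impl Index {
    fn write_txn(&self) -> Result<WriteTxn, String> { Ok(WriteTxn) }
}
impl WriteTxn {
    fn commit(self) -> Result<(), String> { Ok(()) }
}

// Removed style: the caller owns the transaction, so the deletion can be
// batched atomically with other writes, and the caller decides when to commit.
fn delete_in_caller_txn(_wtxn: &mut WriteTxn, _index: &Index) -> Result<u64, String> {
    Ok(0) // evaluate the filter and delete inside the borrowed txn
}

// Restored style: the helper opens and commits its own transaction; the
// signature is simpler, but the write can no longer be grouped with the
// rest of the batch.
fn delete_in_own_txn(index: Index) -> Result<u64, String> {
    let wtxn = index.write_txn()?;
    let deleted = 0; // evaluate the filter and delete here
    wtxn.commit()?;
    Ok(deleted)
}

fn main() -> Result<(), String> {
    let index = Index;
    let mut wtxn = index.write_txn()?;
    delete_in_caller_txn(&mut wtxn, &index)?;
    wtxn.commit()?;
    delete_in_own_txn(Index)?;
    Ok(())
}

The trade-off is atomicity versus signature simplicity: only the borrowed-transaction form lets the scheduler roll the deletion back together with the other operations of the batch.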

View File

@@ -790,19 +790,10 @@ impl IndexScheduler {
         let mut res = BTreeMap::new();
-        let processing_tasks = { self.processing_tasks.read().unwrap().processing.len() };
 
         res.insert(
             "statuses".to_string(),
             enum_iterator::all::<Status>()
-                .map(|s| {
-                    let tasks = self.get_status(&rtxn, s)?.len();
-                    match s {
-                        Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)),
-                        Status::Processing => Ok((s.to_string(), processing_tasks)),
-                        s => Ok((s.to_string(), tasks)),
-                    }
-                })
+                .map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
                 .collect::<Result<BTreeMap<String, u64>>>()?,
         );
 
         res.insert(
@@ -4140,154 +4131,4 @@ mod tests {
        snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed");
        drop(rtxn);
    }
#[test]
fn basic_get_stats() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let kind = index_creation_task("catto", "mouse");
let _task = index_scheduler.register(kind).unwrap();
let kind = index_creation_task("doggo", "sheep");
let _task = index_scheduler.register(kind).unwrap();
let kind = index_creation_task("whalo", "fish");
let _task = index_scheduler.register(kind).unwrap();
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
{
"indexes": {
"catto": 1,
"doggo": 1,
"whalo": 1
},
"statuses": {
"canceled": 0,
"enqueued": 3,
"failed": 0,
"processing": 0,
"succeeded": 0
},
"types": {
"documentAdditionOrUpdate": 0,
"documentDeletion": 0,
"dumpCreation": 0,
"indexCreation": 3,
"indexDeletion": 0,
"indexSwap": 0,
"indexUpdate": 0,
"settingsUpdate": 0,
"snapshotCreation": 0,
"taskCancelation": 0,
"taskDeletion": 0
}
}
"###);
handle.advance_till([Start, BatchCreated]);
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
{
"indexes": {
"catto": 1,
"doggo": 1,
"whalo": 1
},
"statuses": {
"canceled": 0,
"enqueued": 2,
"failed": 0,
"processing": 1,
"succeeded": 0
},
"types": {
"documentAdditionOrUpdate": 0,
"documentDeletion": 0,
"dumpCreation": 0,
"indexCreation": 3,
"indexDeletion": 0,
"indexSwap": 0,
"indexUpdate": 0,
"settingsUpdate": 0,
"snapshotCreation": 0,
"taskCancelation": 0,
"taskDeletion": 0
}
}
"###);
handle.advance_till([
InsideProcessBatch,
InsideProcessBatch,
ProcessBatchSucceeded,
AfterProcessing,
Start,
BatchCreated,
]);
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
{
"indexes": {
"catto": 1,
"doggo": 1,
"whalo": 1
},
"statuses": {
"canceled": 0,
"enqueued": 1,
"failed": 0,
"processing": 1,
"succeeded": 1
},
"types": {
"documentAdditionOrUpdate": 0,
"documentDeletion": 0,
"dumpCreation": 0,
"indexCreation": 3,
"indexDeletion": 0,
"indexSwap": 0,
"indexUpdate": 0,
"settingsUpdate": 0,
"snapshotCreation": 0,
"taskCancelation": 0,
"taskDeletion": 0
}
}
"###);
// now we make one more batch, the started_at field of the new tasks will be past `second_start_time`
handle.advance_till([
InsideProcessBatch,
InsideProcessBatch,
ProcessBatchSucceeded,
AfterProcessing,
Start,
BatchCreated,
]);
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
{
"indexes": {
"catto": 1,
"doggo": 1,
"whalo": 1
},
"statuses": {
"canceled": 0,
"enqueued": 0,
"failed": 0,
"processing": 1,
"succeeded": 2
},
"types": {
"documentAdditionOrUpdate": 0,
"documentDeletion": 0,
"dumpCreation": 0,
"indexCreation": 3,
"indexDeletion": 0,
"indexSwap": 0,
"indexUpdate": 0,
"settingsUpdate": 0,
"snapshotCreation": 0,
"taskCancelation": 0,
"taskDeletion": 0
}
}
"###);
}
}
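The deleted basic_get_stats test pins down what the reverted arithmetic at the top of this file was for: its snapshots show a task moving from enqueued to processing while the total stays at three, which suggests tasks being processed are still stored under the enqueued status, so the 1.4 code subtracted the processing set from the enqueued count. A small self-contained sketch of that accounting, with plain integers standing in for the real IndexScheduler state:

use std::collections::BTreeMap;

// Hypothetical stand-in for the status report: tasks being processed are
// assumed to still be counted under `enqueued`, so the report subtracts
// them out and surfaces them as `processing` instead.
fn status_report(enqueued_total: u64, processing: u64) -> BTreeMap<String, u64> {
    let mut res = BTreeMap::new();
    res.insert("enqueued".to_string(), enqueued_total - processing);
    res.insert("processing".to_string(), processing);
    res
}

fn main() {
    // Mirrors the second snapshot above: 3 registered tasks, 1 in a batch.
    let report = status_report(3, 1);
    assert_eq!(report["enqueued"], 2);
    assert_eq!(report["processing"], 1);
}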

View File

@@ -167,9 +167,7 @@ macro_rules! snapshot {
         let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, Some(&snap_name));
         settings.bind(|| {
             let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
         });
     };
     ($value:expr, @$inline:literal) => {
@@ -178,9 +176,7 @@ macro_rules! snapshot {
         let (settings, _, _) = $crate::default_snapshot_settings_for_test("", Some("_dummy_argument"));
         settings.bind(|| {
             let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(snap, @$inline);
-            }
+            meili_snap::insta::assert_snapshot!(snap, @$inline);
         });
     };
     ($value:expr) => {
@@ -198,9 +194,7 @@ macro_rules! snapshot {
         let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, None);
         settings.bind(|| {
             let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
         });
     };
 }
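For context, `insta::allow_duplicates!` is the insta macro that permits evaluating the same snapshot assertion more than once per test, for example inside a loop; without the wrapper the second evaluation fails. A minimal sketch of the wrapped form that this hunk strips out, assuming only the insta crate:

#[cfg(test)]
mod tests {
    #[test]
    fn same_inline_snapshot_three_times() {
        // Each iteration asserts the same inline snapshot; the wrapper
        // makes the repeated evaluations legal.
        insta::allow_duplicates! {
            for value in [1 + 1, 2 * 1, 4 - 2] {
                insta::assert_snapshot!(value.to_string(), @"2");
            }
        }
    }
}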

View File

@@ -15,13 +15,13 @@ actix-web = { version = "4.3.1", default-features = false }
 anyhow = "1.0.70"
 convert_case = "0.6.0"
 csv = "1.2.1"
-deserr = { version = "0.6.0", features = ["actix-web"]}
+deserr = "0.5.0"
 either = { version = "1.8.1", features = ["serde"] }
 enum-iterator = "1.4.0"
 file-store = { path = "../file-store" }
 flate2 = "1.0.25"
 fst = "0.4.7"
-memmap2 = "0.7.1"
+memmap2 = "0.5.10"
 milli = { path = "../milli" }
 roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }

View File

@@ -259,9 +259,6 @@ InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSortableAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsStopWords , InvalidRequest , BAD_REQUEST ;
-InvalidSettingsNonSeparatorTokens , InvalidRequest , BAD_REQUEST ;
-InvalidSettingsSeparatorTokens , InvalidRequest , BAD_REQUEST ;
-InvalidSettingsDictionary , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSynonyms , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsTypoTolerance , InvalidRequest , BAD_REQUEST ;
 InvalidState , Internal , INTERNAL_SERVER_ERROR ;

View File

@@ -171,15 +171,6 @@ pub struct Settings<T> {
     #[deserr(default, error = DeserrJsonError<InvalidSettingsStopWords>)]
     pub stop_words: Setting<BTreeSet<String>>,
     #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    #[deserr(default, error = DeserrJsonError<InvalidSettingsNonSeparatorTokens>)]
-    pub non_separator_tokens: Setting<BTreeSet<String>>,
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    #[deserr(default, error = DeserrJsonError<InvalidSettingsSeparatorTokens>)]
-    pub separator_tokens: Setting<BTreeSet<String>>,
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    #[deserr(default, error = DeserrJsonError<InvalidSettingsDictionary>)]
-    pub dictionary: Setting<BTreeSet<String>>,
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
     #[deserr(default, error = DeserrJsonError<InvalidSettingsSynonyms>)]
     pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
     #[serde(default, skip_serializing_if = "Setting::is_not_set")]
@@ -210,9 +201,6 @@ impl Settings<Checked> {
             ranking_rules: Setting::Reset,
             stop_words: Setting::Reset,
             synonyms: Setting::Reset,
-            non_separator_tokens: Setting::Reset,
-            separator_tokens: Setting::Reset,
-            dictionary: Setting::Reset,
             distinct_attribute: Setting::Reset,
             typo_tolerance: Setting::Reset,
             faceting: Setting::Reset,
@@ -229,9 +217,6 @@ impl Settings<Checked> {
             sortable_attributes,
             ranking_rules,
             stop_words,
-            non_separator_tokens,
-            separator_tokens,
-            dictionary,
             synonyms,
             distinct_attribute,
             typo_tolerance,
@@ -247,9 +232,6 @@ impl Settings<Checked> {
             sortable_attributes,
             ranking_rules,
             stop_words,
-            non_separator_tokens,
-            separator_tokens,
-            dictionary,
             synonyms,
             distinct_attribute,
             typo_tolerance,
@@ -292,9 +274,6 @@ impl Settings<Unchecked> {
             ranking_rules: self.ranking_rules,
             stop_words: self.stop_words,
             synonyms: self.synonyms,
-            non_separator_tokens: self.non_separator_tokens,
-            separator_tokens: self.separator_tokens,
-            dictionary: self.dictionary,
             distinct_attribute: self.distinct_attribute,
             typo_tolerance: self.typo_tolerance,
             faceting: self.faceting,
@@ -356,28 +335,6 @@ pub fn apply_settings_to_builder(
         Setting::NotSet => (),
     }
 
-    match settings.non_separator_tokens {
-        Setting::Set(ref non_separator_tokens) => {
-            builder.set_non_separator_tokens(non_separator_tokens.clone())
-        }
-        Setting::Reset => builder.reset_non_separator_tokens(),
-        Setting::NotSet => (),
-    }
-
-    match settings.separator_tokens {
-        Setting::Set(ref separator_tokens) => {
-            builder.set_separator_tokens(separator_tokens.clone())
-        }
-        Setting::Reset => builder.reset_separator_tokens(),
-        Setting::NotSet => (),
-    }
-
-    match settings.dictionary {
-        Setting::Set(ref dictionary) => builder.set_dictionary(dictionary.clone()),
-        Setting::Reset => builder.reset_dictionary(),
-        Setting::NotSet => (),
-    }
-
     match settings.synonyms {
         Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
         Setting::Reset => builder.reset_synonyms(),
@@ -502,14 +459,15 @@ pub fn settings(
         })
         .transpose()?
         .unwrap_or_default();
-    let non_separator_tokens = index.non_separator_tokens(rtxn)?.unwrap_or_default();
-    let separator_tokens = index.separator_tokens(rtxn)?.unwrap_or_default();
-    let dictionary = index.dictionary(rtxn)?.unwrap_or_default();
 
     let distinct_field = index.distinct_field(rtxn)?.map(String::from);
 
-    let synonyms = index.user_defined_synonyms(rtxn)?;
+    // in milli each word in the synonyms map were split on their separator. Since we lost
+    // this information we are going to put space between words.
+    let synonyms = index
+        .synonyms(rtxn)?
+        .iter()
+        .map(|(key, values)| (key.join(" "), values.iter().map(|value| value.join(" ")).collect()))
+        .collect();
 
     let min_typo_word_len = MinWordSizeTyposSetting {
         one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?),
@@ -562,9 +520,6 @@ pub fn settings(
         sortable_attributes: Setting::Set(sortable_attributes),
         ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
         stop_words: Setting::Set(stop_words),
-        non_separator_tokens: Setting::Set(non_separator_tokens),
-        separator_tokens: Setting::Set(separator_tokens),
-        dictionary: Setting::Set(dictionary),
         distinct_attribute: match distinct_field {
             Some(field) => Setting::Set(field),
             None => Setting::Reset,
@@ -687,9 +642,6 @@ pub(crate) mod test {
             sortable_attributes: Setting::NotSet,
             ranking_rules: Setting::NotSet,
             stop_words: Setting::NotSet,
-            non_separator_tokens: Setting::NotSet,
-            separator_tokens: Setting::NotSet,
-            dictionary: Setting::NotSet,
             synonyms: Setting::NotSet,
             distinct_attribute: Setting::NotSet,
             typo_tolerance: Setting::NotSet,
@@ -711,9 +663,6 @@ pub(crate) mod test {
             sortable_attributes: Setting::NotSet,
             ranking_rules: Setting::NotSet,
             stop_words: Setting::NotSet,
-            non_separator_tokens: Setting::NotSet,
-            separator_tokens: Setting::NotSet,
-            dictionary: Setting::NotSet,
             synonyms: Setting::NotSet,
             distinct_attribute: Setting::NotSet,
             typo_tolerance: Setting::NotSet,
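The only non-mechanical hunk in this file is the synonyms getter: milli stores each synonym entry as a list of words split on separators, and the restored code re-joins the words with a single space because, as the in-diff comment says, the original separators are lost. A standalone sketch of that reconstruction, with plain BTreeMaps standing in for the index getters:

use std::collections::BTreeMap;

// Sketch of the reconstruction in the hunk above: each key and each synonym
// is a word list, and the route re-joins them with a single space.
fn rebuild_synonyms(
    stored: BTreeMap<Vec<String>, Vec<Vec<String>>>,
) -> BTreeMap<String, Vec<String>> {
    stored
        .iter()
        .map(|(key, values)| {
            (key.join(" "), values.iter().map(|value| value.join(" ")).collect())
        })
        .collect()
}

fn main() {
    let mut stored = BTreeMap::new();
    stored.insert(
        vec!["J.R.R.".to_string()],
        vec![vec!["J.".to_string(), "R.".to_string(), "R.".to_string()]],
    );
    // The word list comes back as one space-joined synonym.
    assert_eq!(rebuild_synonyms(stored)["J.R.R."], vec!["J. R. R."]);
}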

View File

@@ -39,7 +39,7 @@ byte-unit = { version = "4.0.19", default-features = false, features = [
 bytes = "1.4.0"
 clap = { version = "4.2.1", features = ["derive", "env"] }
 crossbeam-channel = "0.5.8"
-deserr = { version = "0.6.0", features = ["actix-web"]}
+deserr = "0.5.0"
 dump = { path = "../dump" }
 either = "1.8.1"
 env_logger = "0.10.0"
@@ -50,9 +50,9 @@ futures = "0.3.28"
 futures-util = "0.3.28"
 http = "0.2.9"
 index-scheduler = { path = "../index-scheduler" }
-indexmap = { version = "2.0.0", features = ["serde"] }
+indexmap = { version = "1.9.3", features = ["serde-1"] }
 is-terminal = "0.4.8"
-itertools = "0.11.0"
+itertools = "0.10.5"
 jsonwebtoken = "8.3.0"
 lazy_static = "1.4.0"
 log = "0.4.17"
@@ -87,7 +87,7 @@ sha2 = "0.10.6"
 siphasher = "0.3.10"
 slice-group-by = "0.3.0"
 static-files = { version = "0.2.3", optional = true }
-sysinfo = "0.29.7"
+sysinfo = "0.28.4"
 tar = "0.4.38"
 tempfile = "3.5.0"
 thiserror = "1.0.40"

View File

@@ -20,7 +20,7 @@ pub struct SearchAggregator;
 #[allow(dead_code)]
 impl SearchAggregator {
     pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
-        Self
+        Self::default()
     }
 
     pub fn succeed(&mut self, _: &dyn Any) {}
@@ -32,7 +32,7 @@ pub struct MultiSearchAggregator;
 #[allow(dead_code)]
 impl MultiSearchAggregator {
     pub fn from_queries(_: &dyn Any, _: &dyn Any) -> Self {
-        Self
+        Self::default()
     }
 
     pub fn succeed(&mut self) {}
@@ -44,7 +44,7 @@ pub struct FacetSearchAggregator;
 #[allow(dead_code)]
 impl FacetSearchAggregator {
     pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
-        Self
+        Self::default()
     }
 
     pub fn succeed(&mut self, _: &dyn Any) {}

View File

@@ -310,81 +310,6 @@ make_setting_route!(
    }
);
make_setting_route!(
"/non-separator-tokens",
put,
std::collections::BTreeSet<String>,
meilisearch_types::deserr::DeserrJsonError<
meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens,
>,
non_separator_tokens,
"nonSeparatorTokens",
analytics,
|non_separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"nonSeparatorTokens Updated".to_string(),
json!({
"non_separator_tokens": {
"total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()),
},
}),
Some(req),
);
}
);
make_setting_route!(
"/separator-tokens",
put,
std::collections::BTreeSet<String>,
meilisearch_types::deserr::DeserrJsonError<
meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens,
>,
separator_tokens,
"separatorTokens",
analytics,
|separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"separatorTokens Updated".to_string(),
json!({
"separator_tokens": {
"total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()),
},
}),
Some(req),
);
}
);
make_setting_route!(
"/dictionary",
put,
std::collections::BTreeSet<String>,
meilisearch_types::deserr::DeserrJsonError<
meilisearch_types::error::deserr_codes::InvalidSettingsDictionary,
>,
dictionary,
"dictionary",
analytics,
|dictionary: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"dictionary Updated".to_string(),
json!({
"dictionary": {
"total": dictionary.as_ref().map(|dictionary| dictionary.len()),
},
}),
Some(req),
);
}
);
make_setting_route!(
    "/synonyms",
    put,
@@ -541,9 +466,6 @@ generate_configure!(
     searchable_attributes,
     distinct_attribute,
     stop_words,
-    separator_tokens,
-    non_separator_tokens,
-    dictionary,
     synonyms,
     ranking_rules,
     typo_tolerance,

View File

@@ -60,7 +60,8 @@ pub async fn swap_indexes(
     }
     let task = KindWithContent::IndexSwap { swaps };
 
-    let task: SummarizedTaskView =
-        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
+    let task = index_scheduler.register(task)?;
+    let task: SummarizedTaskView = task.into();
 
     Ok(HttpResponse::Accepted().json(task))
 }
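The removed side is the standard actix/tokio idiom for calling a synchronous scheduler from an async handler: tokio::task::spawn_blocking keeps the blocking LMDB write off the async worker threads, and the double `?` first unwraps the JoinError, then the scheduler's own error. A minimal sketch with a hypothetical blocking register function standing in for index_scheduler.register:

// Hypothetical blocking call; imagine a synchronous LMDB write inside.
fn register(task: u32) -> Result<u32, std::io::Error> {
    Ok(task)
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let task = 42;
    // `.await` yields Result<Result<u32, io::Error>, JoinError>;
    // the two `?`s unwrap the join error, then the register error.
    let registered = tokio::task::spawn_blocking(move || register(task)).await??;
    assert_eq!(registered, 42);
    Ok(())
}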

View File

@@ -491,20 +491,6 @@ pub fn perform_search(
         tokenizer_builder.allow_list(&script_lang_map);
     }
 
-    let separators = index.allowed_separators(&rtxn)?;
-    let separators: Option<Vec<_>> =
-        separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
-    if let Some(ref separators) = separators {
-        tokenizer_builder.separators(separators);
-    }
-
-    let dictionary = index.dictionary(&rtxn)?;
-    let dictionary: Option<Vec<_>> =
-        dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
-    if let Some(ref dictionary) = dictionary {
-        tokenizer_builder.words_dict(dictionary);
-    }
-
     let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
     formatter_builder.crop_marker(query.crop_marker);
     formatter_builder.highlight_prefix(query.highlight_pre_tag);
@@ -680,7 +666,6 @@ fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option
         .map_err(InternalError::SerdeJson)?;
     Ok(vectors
         .into_iter()
-        .flatten()
         .map(|v| OrderedFloat(dot_product_similarity(query, &v)))
         .max()
         .map(OrderedFloat::into_inner))
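On the second hunk: compute_semantic_score scores a document by the best dot-product similarity between the query vector and the document's stored vectors, and the dropped `.flatten()` unwrapped one extra level of nesting in the deserialized value. A freestanding sketch of the fold, assuming a plain dot product for the similarity (the real dot_product_similarity comes from milli, which also wraps scores in OrderedFloat instead of using total_cmp):

// Plain dot product between two equal-length vectors.
fn dot_product(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| x * y).sum()
}

// Best similarity across all of a document's vectors, None if it has none.
fn semantic_score(query: &[f32], vectors: &[Vec<f32>]) -> Option<f32> {
    vectors
        .iter()
        .map(|v| dot_product(query, v))
        // f32 is not Ord, so pick the max by total ordering.
        .max_by(|a, b| a.total_cmp(b))
}

fn main() {
    let query = [1.0, 0.0];
    let vectors = vec![vec![0.5, 0.5], vec![0.9, 0.1]];
    assert_eq!(semantic_score(&query, &vectors), Some(0.9));
}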

View File

@@ -154,19 +154,6 @@ async fn delete_document_by_filter() {
         )
         .await;
     index.wait_task(1).await;
 
-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 4,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 3,
-        "id": 4
-      }
-    }
-    "###);
-
     let (response, code) =
         index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
     snapshot!(code, @"202 Accepted");
@@ -201,18 +188,6 @@
     }
     "###);
 
-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 2,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 1,
-        "id": 2
-      }
-    }
-    "###);
-
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"
@@ -266,18 +241,6 @@
     }
     "###);
 
-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 1,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 1,
-        "id": 1
-      }
-    }
-    "###);
-
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"

File diff suppressed because it is too large.

View File

@@ -1,241 +0,0 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::Server;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Brown"
},
{
"id": 2,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Black"
},
{
"id": 3,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Blue"
},
{
"id": 4,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Red"
},
{
"id": 5,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Blue"
},
{
"id": 6,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "Black"
},
{
"id": 7,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "White"
},
{
"id": 8,
"description": "Hoodie",
"brand": "Puma",
"product_id": "987654",
"color": "Gray"
},
{
"id": 9,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Green"
},
{
"id": 10,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Red"
},
{
"id": 11,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Blue"
},
{
"id": 12,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Indigo"
},
{
"id": 13,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Black"
},
{
"id": 14,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Stone Wash"
}
])
});
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id";
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test]
async fn distinct_search_with_offset_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["estimatedTotalHits"] , @"11");
let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["estimatedTotalHits"], @"10");
let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"1");
snapshot!(format!("{:?}", hits), @r#"["345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
}
/// testing: https://github.com/meilisearch/meilisearch/issues/4130
#[actix_rt::test]
async fn distinct_search_with_pagination_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"0");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["page"], @"1");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["page"], @"3");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"4");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"3");
snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"2");
snapshot!(response["totalHits"], @"6");
}

View File

@@ -1,4 +1,3 @@
-use meili_snap::{json_string, snapshot};
 use once_cell::sync::Lazy;
 use serde_json::{json, Value};
@@ -61,59 +60,3 @@ async fn geo_sort_with_geo_strings() {
        )
        .await;
}
#[actix_rt::test]
async fn geo_bounding_box_with_string_and_number() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.update_settings_filterable_attributes(json!(["_geo"])).await;
index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.add_documents(documents, None).await;
index.wait_task(2).await;
index
.search(
json!({
"filter": "_geoBoundingBox([89, 179], [-89, -179])",
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
}
},
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
},
)
.await;
}

View File

@@ -1,7 +1,6 @@
 // This modules contains all the test concerning search. Each particular feature of the search
 // should be tested in its own module to isolate tests and keep the tests readable.
 
-mod distinct;
 mod errors;
 mod facet_search;
 mod formatted;
@@ -1105,59 +1104,3 @@ async fn camelcased_words() {
        })
        .await;
}
#[actix_rt::test]
async fn simple_search_with_strange_synonyms() {
let server = Server::new().await;
let index = server.index("test");
index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
let r = index.wait_task(0).await;
meili_snap::snapshot!(r["status"], @r###""succeeded""###);
let documents = DOCUMENTS.clone();
index.add_documents(documents, None).await;
index.wait_task(1).await;
index
.search(json!({"q": "How to train"}), |response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428"
}
]
"###);
})
.await;
index
.search(json!({"q": "How & train"}), |response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428"
}
]
"###);
})
.await;
index
.search(json!({"q": "to"}), |response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428"
}
]
"###);
})
.await;
}

View File

@@ -16,9 +16,6 @@ static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|
         json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]),
     );
     map.insert("stop_words", json!([]));
-    map.insert("non_separator_tokens", json!([]));
-    map.insert("separator_tokens", json!([]));
-    map.insert("dictionary", json!([]));
     map.insert("synonyms", json!({}));
     map.insert(
         "faceting",
@@ -54,7 +51,7 @@ async fn get_settings() {
     let (response, code) = index.settings().await;
     assert_eq!(code, 200);
     let settings = response.as_object().unwrap();
-    assert_eq!(settings.keys().len(), 14);
+    assert_eq!(settings.keys().len(), 11);
     assert_eq!(settings["displayedAttributes"], json!(["*"]));
     assert_eq!(settings["searchableAttributes"], json!(["*"]));
     assert_eq!(settings["filterableAttributes"], json!([]));
@@ -65,9 +62,6 @@ async fn get_settings() {
         json!(["words", "typo", "proximity", "attribute", "sort", "exactness"])
     );
     assert_eq!(settings["stopWords"], json!([]));
-    assert_eq!(settings["nonSeparatorTokens"], json!([]));
-    assert_eq!(settings["separatorTokens"], json!([]));
-    assert_eq!(settings["dictionary"], json!([]));
     assert_eq!(
         settings["faceting"],
         json!({
@@ -278,9 +272,6 @@ test_setting_routes!(
     searchable_attributes put,
     distinct_attribute put,
     stop_words put,
-    separator_tokens put,
-    non_separator_tokens put,
-    dictionary put,
     ranking_rules put,
     synonyms put,
     pagination patch,

View File

@@ -1,4 +1,3 @@
 mod distinct;
 mod errors;
 mod get_settings;
-mod tokenizer_customization;

View File

@@ -1,467 +0,0 @@
use meili_snap::{json_string, snapshot};
use serde_json::json;
use crate::common::Server;
#[actix_rt::test]
async fn set_and_reset() {
let server = Server::new().await;
let index = server.index("test");
let (_response, _code) = index
.update_settings(json!({
"nonSeparatorTokens": ["#", "&"],
"separatorTokens": ["&sep", "<br/>"],
"dictionary": ["J.R.R.", "J. R. R."],
}))
.await;
index.wait_task(0).await;
let (response, _) = index.settings().await;
snapshot!(json_string!(response["nonSeparatorTokens"]), @r###"
[
"#",
"&"
]
"###);
snapshot!(json_string!(response["separatorTokens"]), @r###"
[
"&sep",
"<br/>"
]
"###);
snapshot!(json_string!(response["dictionary"]), @r###"
[
"J. R. R.",
"J.R.R."
]
"###);
index
.update_settings(json!({
"nonSeparatorTokens": null,
"separatorTokens": null,
"dictionary": null,
}))
.await;
index.wait_task(1).await;
let (response, _) = index.settings().await;
snapshot!(json_string!(response["nonSeparatorTokens"]), @"[]");
snapshot!(json_string!(response["separatorTokens"]), @"[]");
snapshot!(json_string!(response["dictionary"]), @"[]");
}
#[actix_rt::test]
async fn set_and_search() {
let documents = json!([
{
"id": 1,
"content": "Mac & cheese",
},
{
"id": 2,
"content": "G#D#G#D#G#C#D#G#C#",
},
{
"id": 3,
"content": "Mac&sep&&sepcheese",
},
]);
let server = Server::new().await;
let index = server.index("test");
index.add_documents(documents, None).await;
index.wait_task(0).await;
let (_response, _code) = index
.update_settings(json!({
"nonSeparatorTokens": ["#", "&"],
"separatorTokens": ["<br/>", "&sep"],
"dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"],
}))
.await;
index.wait_task(1).await;
index
.search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"content": "Mac & cheese",
"_formatted": {
"id": "1",
"content": "Mac <em>&</em> cheese"
}
},
{
"id": 3,
"content": "Mac&sep&&sepcheese",
"_formatted": {
"id": "3",
"content": "Mac&sep<em>&</em>&sepcheese"
}
}
]
"###);
})
.await;
index
.search(
json!({"q": "Mac & cheese", "attributesToHighlight": ["content"]}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"content": "Mac & cheese",
"_formatted": {
"id": "1",
"content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
}
},
{
"id": 3,
"content": "Mac&sep&&sepcheese",
"_formatted": {
"id": "3",
"content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
}
}
]
"###);
},
)
.await;
index
.search(
json!({"q": "Mac&sep&&sepcheese", "attributesToHighlight": ["content"]}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"content": "Mac & cheese",
"_formatted": {
"id": "1",
"content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
}
},
{
"id": 3,
"content": "Mac&sep&&sepcheese",
"_formatted": {
"id": "3",
"content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
}
}
]
"###);
},
)
.await;
index
.search(json!({"q": "C#D#G", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 2,
"content": "G#D#G#D#G#C#D#G#C#",
"_formatted": {
"id": "2",
"content": "<em>G</em>#<em>D#</em><em>G</em>#<em>D#</em><em>G</em>#<em>C#</em><em>D#</em><em>G</em>#<em>C#</em>"
}
}
]
"###);
})
.await;
index
.search(json!({"q": "#", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @"[]");
})
.await;
}
#[actix_rt::test]
async fn advanced_synergies() {
let documents = json!([
{
"id": 1,
"content": "J.R.R. Tolkien",
},
{
"id": 2,
"content": "J. R. R. Tolkien",
},
{
"id": 3,
"content": "jrr Tolkien",
},
{
"id": 4,
"content": "J.K. Rowlings",
},
{
"id": 5,
"content": "J. K. Rowlings",
},
{
"id": 6,
"content": "jk Rowlings",
},
]);
let server = Server::new().await;
let index = server.index("test");
index.add_documents(documents, None).await;
index.wait_task(0).await;
let (_response, _code) = index
.update_settings(json!({
"dictionary": ["J.R.R.", "J. R. R."],
"synonyms": {
"J.R.R.": ["jrr", "J. R. R."],
"J. R. R.": ["jrr", "J.R.R."],
"jrr": ["J.R.R.", "J. R. R."],
"J.K.": ["jk", "J. K."],
"J. K.": ["jk", "J.K."],
"jk": ["J.K.", "J. K."],
}
}))
.await;
index.wait_task(1).await;
index
.search(json!({"q": "J.R.R.", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"content": "J.R.R. Tolkien",
"_formatted": {
"id": "1",
"content": "<em>J.R.R.</em> Tolkien"
}
},
{
"id": 2,
"content": "J. R. R. Tolkien",
"_formatted": {
"id": "2",
"content": "<em>J. R. R.</em> Tolkien"
}
},
{
"id": 3,
"content": "jrr Tolkien",
"_formatted": {
"id": "3",
"content": "<em>jrr</em> Tolkien"
}
}
]
"###);
})
.await;
index
.search(json!({"q": "jrr", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 3,
"content": "jrr Tolkien",
"_formatted": {
"id": "3",
"content": "<em>jrr</em> Tolkien"
}
},
{
"id": 1,
"content": "J.R.R. Tolkien",
"_formatted": {
"id": "1",
"content": "<em>J.R.R.</em> Tolkien"
}
},
{
"id": 2,
"content": "J. R. R. Tolkien",
"_formatted": {
"id": "2",
"content": "<em>J. R. R.</em> Tolkien"
}
}
]
"###);
})
.await;
index
.search(json!({"q": "J. R. R.", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 2,
"content": "J. R. R. Tolkien",
"_formatted": {
"id": "2",
"content": "<em>J. R. R.</em> Tolkien"
}
},
{
"id": 1,
"content": "J.R.R. Tolkien",
"_formatted": {
"id": "1",
"content": "<em>J.R.R.</em> Tolkien"
}
},
{
"id": 3,
"content": "jrr Tolkien",
"_formatted": {
"id": "3",
"content": "<em>jrr</em> Tolkien"
}
}
]
"###);
})
.await;
// Only update dictionary, the synonyms should be recomputed.
let (_response, _code) = index
.update_settings(json!({
"dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
}))
.await;
index.wait_task(2).await;
index
.search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 6,
"content": "jk Rowlings",
"_formatted": {
"id": "6",
"content": "<em>jk</em> Rowlings"
}
},
{
"id": 4,
"content": "J.K. Rowlings",
"_formatted": {
"id": "4",
"content": "<em>J.K.</em> Rowlings"
}
},
{
"id": 5,
"content": "J. K. Rowlings",
"_formatted": {
"id": "5",
"content": "<em>J. K.</em> Rowlings"
}
}
]
"###);
})
.await;
index
.search(json!({"q": "J.K.", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 4,
"content": "J.K. Rowlings",
"_formatted": {
"id": "4",
"content": "<em>J.K.</em> Rowlings"
}
},
{
"id": 5,
"content": "J. K. Rowlings",
"_formatted": {
"id": "5",
"content": "<em>J. K.</em> Rowlings"
}
},
{
"id": 6,
"content": "jk Rowlings",
"_formatted": {
"id": "6",
"content": "<em>jk</em> Rowlings"
}
}
]
"###);
})
.await;
index
.search(json!({"q": "J. K.", "attributesToHighlight": ["content"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 5,
"content": "J. K. Rowlings",
"_formatted": {
"id": "5",
"content": "<em>J. K.</em> Rowlings"
}
},
{
"id": 4,
"content": "J.K. Rowlings",
"_formatted": {
"id": "4",
"content": "<em>J.K.</em> Rowlings"
}
},
{
"id": 6,
"content": "jk Rowlings",
"_formatted": {
"id": "6",
"content": "<em>jk</em> Rowlings"
}
},
{
"id": 2,
"content": "J. R. R. Tolkien",
"_formatted": {
"id": "2",
"content": "<em>J. R.</em> R. Tolkien"
}
}
]
"###);
})
.await;
}
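
Taken together, these tests pin down the three tokenizer settings: `separatorTokens` adds separators on top of charabia's defaults, `nonSeparatorTokens` demotes tokens that would otherwise split words, and `dictionary` declares atomic words the tokenizer must not break apart (and against which synonyms are re-normalized). A minimal settings update combining them, in the same shape the tests above use (index and values are illustrative):

// Hedged sketch: same payload shape as the tests above.
let (_response, _code) = index
    .update_settings(json!({
        "separatorTokens": ["<br/>", "&sep"],
        "nonSeparatorTokens": ["#", "&"],
        "dictionary": ["J. R. R.", "J.R.R."],
    }))
    .await;
index.wait_task(1).await;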

View File

@@ -17,10 +17,10 @@ bincode = "1.3.3"
 bstr = "1.4.0"
 bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
 byteorder = "1.4.3"
-charabia = { version = "0.8.3", default-features = false }
+charabia = { version = "0.8.2", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.8"
-deserr = { version = "0.6.0", features = ["actix-web"]}
+deserr = "0.5.0"
 either = { version = "1.8.1", features = ["serde"] }
 flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
@@ -32,18 +32,18 @@ grenad = { version = "0.4.4", default-features = false, features = [
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
 "lmdb", "read-txn-no-tls"
 ] }
-indexmap = { version = "2.0.0", features = ["serde"] }
+indexmap = { version = "1.9.3", features = ["serde"] }
 instant-distance = { version = "0.6.1", features = ["with-serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
-memmap2 = "0.7.1"
+memmap2 = "0.5.10"
 obkv = "0.2.0"
 once_cell = "1.17.1"
 ordered-float = "3.6.0"
 rand_pcg = { version = "0.3.1", features = ["serde1"] }
 rayon = "1.7.0"
 roaring = "0.10.1"
-rstar = { version = "0.11.0", features = ["serde"] }
+rstar = { version = "0.10.0", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
 slice-group-by = "0.3.0"
@@ -63,7 +63,7 @@ uuid = { version = "1.3.1", features = ["v4"] }
 filter-parser = { path = "../filter-parser" }
 # documents words self-join
-itertools = "0.11.0"
+itertools = "0.10.5"
 # profiling
 puffin = "0.16.0"

View File

@@ -1,5 +1,4 @@
 use std::fs::File;
-use std::io::BufReader;
 use std::{io, str};
 use obkv::KvReader;
@@ -20,14 +19,14 @@ use crate::FieldId;
 pub struct EnrichedDocumentsBatchReader<R> {
 documents: DocumentsBatchReader<R>,
 primary_key: String,
-external_ids: grenad::ReaderCursor<BufReader<File>>,
+external_ids: grenad::ReaderCursor<File>,
 }
 impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
 pub fn new(
 documents: DocumentsBatchReader<R>,
 primary_key: String,
-external_ids: grenad::Reader<BufReader<File>>,
+external_ids: grenad::Reader<File>,
 ) -> Result<Self, Error> {
 if documents.documents_count() as u64 == external_ids.len() {
 Ok(EnrichedDocumentsBatchReader {
@@ -76,7 +75,7 @@ pub struct EnrichedDocument<'a> {
 pub struct EnrichedDocumentsBatchCursor<R> {
 documents: DocumentsBatchCursor<R>,
 primary_key: String,
-external_ids: grenad::ReaderCursor<BufReader<File>>,
+external_ids: grenad::ReaderCursor<File>,
 }
 impl<R> EnrichedDocumentsBatchCursor<R> {

View File

@@ -122,28 +122,22 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
 .field,
 match .valid_fields.is_empty() {
 true => "This index does not have configured sortable attributes.".to_string(),
-false => format!("Available sortable attributes are: `{}{}`.",
-valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
-.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
+false => format!("Available sortable attributes are: `{}`.",
+valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
 ),
 }
 )]
-InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
+InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
 #[error("Attribute `{}` is not facet-searchable. {}",
 .field,
 match .valid_fields.is_empty() {
 true => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(),
-false => format!("Available facet-searchable attributes are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.",
-valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
-.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
+false => format!("Available facet-searchable attributes are: `{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.",
+valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
 ),
 }
 )]
-InvalidFacetSearchFacetName {
-field: String,
-valid_fields: BTreeSet<String>,
-hidden_fields: bool,
-},
+InvalidFacetSearchFacetName { field: String, valid_fields: BTreeSet<String> },
 #[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.",
 .field,
 .valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
@@ -346,11 +340,8 @@ fn conditionally_lookup_for_error_message() {
 ];
 for (list, suffix) in messages {
-let err = UserError::InvalidSortableAttribute {
-field: "name".to_string(),
-valid_fields: list,
-hidden_fields: false,
-};
+let err =
+UserError::InvalidSortableAttribute { field: "name".to_string(), valid_fields: list };
 assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
 }
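
The dropped `hidden_fields` flag drives the `", <..hidden-attributes>"` suffix in these messages. A small hedged sketch of that formatting, using `bool::then_some` exactly as the left-hand side does (function name is illustrative, not the crate's API):

use std::collections::BTreeSet;

// Appends a hint when some matching fields exist but are not displayed.
fn sortable_attributes_message(valid_fields: &BTreeSet<String>, hidden_fields: bool) -> String {
    format!(
        "Available sortable attributes are: `{}{}`.",
        valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
        hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
    )
}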

View File

@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@@ -61,12 +61,8 @@ pub mod main_key {
 pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const STOP_WORDS_KEY: &str = "stop-words";
-pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
-pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
-pub const DICTIONARY_KEY: &str = "dictionary";
 pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
 pub const SYNONYMS_KEY: &str = "synonyms";
-pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
 pub const WORDS_FST_KEY: &str = "words-fst";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 pub const CREATED_AT_KEY: &str = "created-at";
@@ -655,26 +651,6 @@ impl Index {
 }
 }
-/* remove hidden fields */
-pub fn remove_hidden_fields(
-&self,
-rtxn: &RoTxn,
-fields: impl IntoIterator<Item = impl AsRef<str>>,
-) -> Result<(BTreeSet<String>, bool)> {
-let mut valid_fields =
-fields.into_iter().map(|f| f.as_ref().to_string()).collect::<BTreeSet<String>>();
-let fields_len = valid_fields.len();
-if let Some(dn) = self.displayed_fields(rtxn)? {
-let displayable_names = dn.iter().map(|s| s.to_string()).collect();
-valid_fields = &valid_fields & &displayable_names;
-}
-let hidden_fields = fields_len > valid_fields.len();
-Ok((valid_fields, hidden_fields))
-}
 /* searchable fields */
 /// Write the user defined searchable fields and generate the real searchable fields from the specified fields ids map.
@@ -1079,116 +1055,18 @@ impl Index {
 }
 }
-/* non separator tokens */
-pub(crate) fn put_non_separator_tokens(
-&self,
-wtxn: &mut RwTxn,
-set: &BTreeSet<String>,
-) -> heed::Result<()> {
-self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY, set)
-}
-pub(crate) fn delete_non_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-self.main.delete::<_, Str>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY)
-}
-pub fn non_separator_tokens(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
-Ok(self.main.get::<_, Str, SerdeBincode<BTreeSet<String>>>(
-rtxn,
-main_key::NON_SEPARATOR_TOKENS_KEY,
-)?)
-}
-/* separator tokens */
-pub(crate) fn put_separator_tokens(
-&self,
-wtxn: &mut RwTxn,
-set: &BTreeSet<String>,
-) -> heed::Result<()> {
-self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SEPARATOR_TOKENS_KEY, set)
-}
-pub(crate) fn delete_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-self.main.delete::<_, Str>(wtxn, main_key::SEPARATOR_TOKENS_KEY)
-}
-pub fn separator_tokens(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
-Ok(self
-.main
-.get::<_, Str, SerdeBincode<BTreeSet<String>>>(rtxn, main_key::SEPARATOR_TOKENS_KEY)?)
-}
-/* separators easing method */
-pub fn allowed_separators(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
-let default_separators =
-charabia::separators::DEFAULT_SEPARATORS.iter().map(|s| s.to_string());
-let mut separators: Option<BTreeSet<_>> = None;
-if let Some(mut separator_tokens) = self.separator_tokens(rtxn)? {
-separator_tokens.extend(default_separators.clone());
-separators = Some(separator_tokens);
-}
-if let Some(non_separator_tokens) = self.non_separator_tokens(rtxn)? {
-separators = separators
-.or_else(|| Some(default_separators.collect()))
-.map(|separators| &separators - &non_separator_tokens);
-}
-Ok(separators)
-}
-/* dictionary */
-pub(crate) fn put_dictionary(
-&self,
-wtxn: &mut RwTxn,
-set: &BTreeSet<String>,
-) -> heed::Result<()> {
-self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::DICTIONARY_KEY, set)
-}
-pub(crate) fn delete_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-self.main.delete::<_, Str>(wtxn, main_key::DICTIONARY_KEY)
-}
-pub fn dictionary(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
-Ok(self
-.main
-.get::<_, Str, SerdeBincode<BTreeSet<String>>>(rtxn, main_key::DICTIONARY_KEY)?)
-}
 /* synonyms */
 pub(crate) fn put_synonyms(
 &self,
 wtxn: &mut RwTxn,
 synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
-user_defined_synonyms: &BTreeMap<String, Vec<String>>,
 ) -> heed::Result<()> {
-self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?;
-self.main.put::<_, Str, SerdeBincode<_>>(
-wtxn,
-main_key::USER_DEFINED_SYNONYMS_KEY,
-user_defined_synonyms,
-)
+self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
 }
 pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?;
-self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY)
+self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)
 }
-pub fn user_defined_synonyms(
-&self,
-rtxn: &RoTxn,
-) -> heed::Result<BTreeMap<String, Vec<String>>> {
-Ok(self
-.main
-.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)?
-.unwrap_or_default())
-}
 pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
@@ -1840,11 +1718,11 @@ pub(crate) mod tests {
 .unwrap();
 index
 .add_documents(documents!([
-{ "id": 0, "_geo": { "lat": "0", "lng": "0" } },
-{ "id": 1, "_geo": { "lat": 0, "lng": "-175" } },
-{ "id": 2, "_geo": { "lat": "0", "lng": 175 } },
+{ "id": 0, "_geo": { "lat": 0, "lng": 0 } },
+{ "id": 1, "_geo": { "lat": 0, "lng": -175 } },
+{ "id": 2, "_geo": { "lat": 0, "lng": 175 } },
 { "id": 3, "_geo": { "lat": 85, "lng": 0 } },
-{ "id": 4, "_geo": { "lat": "-85", "lng": "0" } },
+{ "id": 4, "_geo": { "lat": -85, "lng": 0 } },
 ]))
 .unwrap();
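
The removed `allowed_separators` helper merges charabia's default separators with the user-defined `separatorTokens`, then subtracts `nonSeparatorTokens`. A standalone sketch of that set arithmetic (free function for illustration; the real method reads both sets from LMDB):

use std::collections::BTreeSet;

fn effective_separators(
    defaults: &[&str],
    separator_tokens: Option<BTreeSet<String>>,
    non_separator_tokens: Option<BTreeSet<String>>,
) -> Option<BTreeSet<String>> {
    let defaults = defaults.iter().map(|s| s.to_string());
    let mut separators = None;
    if let Some(mut tokens) = separator_tokens {
        // User separators extend the defaults rather than replacing them.
        tokens.extend(defaults.clone());
        separators = Some(tokens);
    }
    if let Some(non_separators) = non_separator_tokens {
        // Non-separator tokens are removed even from the default set.
        separators = separators
            .or_else(|| Some(defaults.collect()))
            .map(|seps| &seps - &non_separators);
    }
    separators // None means "defaults, untouched"
}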

View File

@@ -97,7 +97,7 @@ const MAX_LMDB_KEY_LENGTH: usize = 500;
 ///
 /// This number is determined by the keys of the different facet databases
 /// and adding a margin of safety.
-pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
 /// The maximum length a word can be
 pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
@@ -293,15 +293,15 @@ pub fn normalize_facet(original: &str) -> String {
 #[derive(serde::Serialize, serde::Deserialize, Debug)]
 #[serde(transparent)]
 pub struct VectorOrArrayOfVectors {
-#[serde(with = "either::serde_untagged_optional")]
-inner: Option<either::Either<Vec<f32>, Vec<Vec<f32>>>>,
+#[serde(with = "either::serde_untagged")]
+inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
 }
 impl VectorOrArrayOfVectors {
-pub fn into_array_of_vectors(self) -> Option<Vec<Vec<f32>>> {
-match self.inner? {
-either::Either::Left(vector) => Some(vec![vector]),
-either::Either::Right(vectors) => Some(vectors),
+pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
+match self.inner {
+either::Either::Left(vector) => vec![vector],
+either::Either::Right(vectors) => vectors,
 }
 }
 }
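
The left-hand struct accepts a `_vectors` value that is a single vector, an array of vectors, or `null`. A self-contained sketch of that shape (struct redeclared here for illustration; assumes the `either` crate with its `serde` feature enabled):

use either::Either;

#[derive(serde::Deserialize, Debug)]
#[serde(transparent)]
struct VectorsField {
    // Tries Vec<f32> first, then Vec<Vec<f32>>; JSON null becomes None.
    #[serde(with = "either::serde_untagged_optional")]
    inner: Option<Either<Vec<f32>, Vec<Vec<f32>>>>,
}

fn main() {
    let one: VectorsField = serde_json::from_str("[1.0, 2.0]").unwrap();
    let many: VectorsField = serde_json::from_str("[[1.0], [2.0]]").unwrap();
    let none: VectorsField = serde_json::from_str("null").unwrap();
    println!("{one:?} {many:?} {none:?}");
}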

View File

@@ -280,13 +280,9 @@ impl<'a> SearchForFacetValues<'a> {
 let filterable_fields = index.filterable_fields(rtxn)?;
 if !filterable_fields.contains(&self.facet) {
-let (valid_fields, hidden_fields) =
-index.remove_hidden_fields(rtxn, filterable_fields)?;
 return Err(UserError::InvalidFacetSearchFacetName {
 field: self.facet.clone(),
-valid_fields,
-hidden_fields,
+valid_fields: filterable_fields.into_iter().collect(),
 }
 .into());
 }

View File

@@ -53,22 +53,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
 if excluded.contains(docid) {
 continue;
 }
 distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
 results.push(docid);
 }
 let mut all_candidates = universe - excluded;
 all_candidates.extend(results.iter().copied());
-// drain the results of the skipped elements
-// this **must** be done **after** writing the entire results in `all_candidates` to ensure
-// e.g. estimatedTotalHits is correct.
-if results.len() >= from {
-results.drain(..from);
-} else {
-results.clear();
-}
 return Ok(BucketSortOutput {
 scores: vec![Default::default(); results.len()],
 docids: results,
@@ -102,12 +91,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
 /// Update the universes accordingly and inform the logger.
 macro_rules! back {
 () => {
-// FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
-// assert!(
-// ranking_rule_universes[cur_ranking_rule_index].is_empty(),
-// "The ranking rule {} did not sort its bucket exhaustively",
-// ranking_rules[cur_ranking_rule_index].id()
-// );
+assert!(
+ranking_rule_universes[cur_ranking_rule_index].is_empty(),
+"The ranking rule {} did not sort its bucket exhaustively",
+ranking_rules[cur_ranking_rule_index].id()
+);
 logger.end_iteration_ranking_rule(
 cur_ranking_rule_index,
 ranking_rules[cur_ranking_rule_index].as_ref(),
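
The block removed from the distinct path applies the pagination offset only after every kept docid has been folded into `all_candidates`, so counters such as `estimatedTotalHits` still see the skipped documents. A minimal sketch of that ordering (free function with illustrative types, not the crate's signature):

use roaring::RoaringBitmap;

fn apply_offset(results: &mut Vec<u32>, all_candidates: &mut RoaringBitmap, from: usize) {
    // Account for everything first...
    all_candidates.extend(results.iter().copied());
    // ...then drop the first `from` hits to honor the requested offset.
    if results.len() >= from {
        results.drain(..from);
    } else {
        results.clear();
    }
}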

View File

@@ -418,11 +418,19 @@ impl<'t> Matcher<'t, '_> {
 } else {
 match &self.matches {
 Some((tokens, matches)) => {
+// If the text has to be cropped,
+// compute the best interval to crop around.
+let matches = match format_options.crop {
+Some(crop_size) if crop_size > 0 => {
+self.find_best_match_interval(matches, crop_size)
+}
+_ => matches,
+};
 // If the text has to be cropped,
 // crop around the best interval.
 let (byte_start, byte_end) = match format_options.crop {
 Some(crop_size) if crop_size > 0 => {
-let matches = self.find_best_match_interval(matches, crop_size);
 self.crop_bounds(tokens, matches, crop_size)
 }
 _ => (0, self.text.len()),
@@ -442,11 +450,6 @@ impl<'t> Matcher<'t, '_> {
 for m in matches {
 let token = &tokens[m.token_position];
-// skip matches out of the crop window.
-if token.byte_start < byte_start || token.byte_end > byte_end {
-continue;
-}
 if byte_index < token.byte_start {
 formatted.push(&self.text[byte_index..token.byte_start]);
 }
@@ -797,37 +800,6 @@ mod tests {
 );
 }
-#[test]
-fn format_highlight_crop_phrase_query() {
-//! testing: https://github.com/meilisearch/meilisearch/issues/3975
-let temp_index = TempIndex::new();
-temp_index
-.add_documents(documents!([
-{ "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
-]))
-.unwrap();
-let rtxn = temp_index.read_txn().unwrap();
-let format_options = FormatOptions { highlight: true, crop: Some(10) };
-let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
-let mut matcher = builder.build(text);
-// should return 10 words with a marker at the start as well the end, and the highlighted matches.
-insta::assert_snapshot!(
-matcher.format(format_options),
-@"…had the power to split <em>the</em> <em>world</em> between those who…"
-);
-let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
-let mut matcher = builder.build(text);
-// should highlight "those" and the phrase "and those".
-insta::assert_snapshot!(
-matcher.format(format_options),
-@"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
-);
-}
 #[test]
 fn smaller_crop_size() {
 //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295

View File

@@ -20,7 +20,7 @@ mod sort;
 #[cfg(test)]
 mod tests;
-use std::collections::HashSet;
+use std::collections::{BTreeSet, HashSet};
 use bucket_sort::{bucket_sort, BucketSortOutput};
 use charabia::TokenizerBuilder;
@@ -108,11 +108,24 @@ impl<'ctx> SearchContext<'ctx> {
 (None, None) => continue,
 // The field is not searchable => User error
 (_fid, Some(false)) => {
-let (valid_fields, hidden_fields) = match searchable_names {
-Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?,
-None => self.index.remove_hidden_fields(self.txn, fids_map.names())?,
-};
+let mut valid_fields: BTreeSet<_> =
+fids_map.names().map(String::from).collect();
+// Filter by the searchable names
+if let Some(sn) = searchable_names {
+let searchable_names = sn.iter().map(|s| s.to_string()).collect();
+valid_fields = &valid_fields & &searchable_names;
+}
+let searchable_count = valid_fields.len();
+// Remove hidden fields
+if let Some(dn) = self.index.displayed_fields(self.txn)? {
+let displayable_names = dn.iter().map(|s| s.to_string()).collect();
+valid_fields = &valid_fields & &displayable_names;
+}
+let hidden_fields = searchable_count > valid_fields.len();
 let field = field_name.to_string();
 return Err(UserError::InvalidSearchableAttribute {
 field,
@@ -475,20 +488,6 @@ pub fn execute_search(
 tokbuilder.stop_words(stop_words);
 }
-let separators = ctx.index.allowed_separators(ctx.txn)?;
-let separators: Option<Vec<_>> =
-separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
-if let Some(ref separators) = separators {
-tokbuilder.separators(separators);
-}
-let dictionary = ctx.index.dictionary(ctx.txn)?;
-let dictionary: Option<Vec<_>> =
-dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
-if let Some(ref dictionary) = dictionary {
-tokbuilder.words_dict(dictionary);
-}
 let script_lang_map = ctx.index.script_language(ctx.txn)?;
 if !script_lang_map.is_empty() {
 tokbuilder.allow_list(&script_lang_map);
@@ -591,24 +590,16 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>
 for asc_desc in sort_criteria {
 match asc_desc.member() {
 Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
-let (valid_fields, hidden_fields) =
-ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
 return Err(UserError::InvalidSortableAttribute {
 field: field.to_string(),
-valid_fields,
-hidden_fields,
-})?;
+valid_fields: sortable_fields.into_iter().collect(),
+})?
 }
 Member::Geo(_) if !sortable_fields.contains("_geo") => {
-let (valid_fields, hidden_fields) =
-ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
 return Err(UserError::InvalidSortableAttribute {
 field: "_geo".to_string(),
-valid_fields,
-hidden_fields,
-})?;
+valid_fields: sortable_fields.into_iter().collect(),
+})?
 }
 _ => (),
 }
 }

View File

@@ -2,7 +2,7 @@ use std::io::Cursor;
 use big_s::S;
 use heed::EnvOpenOptions;
-use maplit::{btreemap, hashset};
+use maplit::{hashmap, hashset};
 use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@@ -33,7 +33,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
 S("tag"),
 S("asc_desc_rank"),
 });
-builder.set_synonyms(btreemap! {
+builder.set_synonyms(hashmap! {
 S("hello") => vec![S("good morning")],
 S("world") => vec![S("earth")],
 S("america") => vec![S("the united states")],

View File

@@ -15,7 +15,7 @@ they store fewer sprximities than the regular word sprximity DB.
 */
-use std::collections::BTreeMap;
+use std::collections::HashMap;
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@@ -336,7 +336,7 @@ fn test_proximity_split_word() {
 index
 .update_settings(|s| {
-let mut syns = BTreeMap::new();
+let mut syns = HashMap::new();
 syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
 s.set_synonyms(syns);
 })

View File

@@ -18,7 +18,7 @@ if `words` doesn't exist before it.
 14. Synonyms cost nothing according to the typo ranking rule
 */
-use std::collections::BTreeMap;
+use std::collections::HashMap;
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@@ -591,7 +591,7 @@ fn test_typo_synonyms() {
 .update_settings(|s| {
 s.set_criteria(vec![Criterion::Typo]);
-let mut synonyms = BTreeMap::new();
+let mut synonyms = HashMap::new();
 synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
 synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);

View File

@@ -1,6 +1,5 @@
 use std::borrow::Cow;
 use std::fs::File;
-use std::io::BufReader;
 use grenad::CompressionType;
 use heed::types::ByteSlice;
@@ -31,7 +30,7 @@ pub struct FacetsUpdateBulk<'i> {
 facet_type: FacetType,
 field_ids: Vec<FieldId>,
 // None if level 0 does not need to be updated
-new_data: Option<grenad::Reader<BufReader<File>>>,
+new_data: Option<grenad::Reader<File>>,
 }
 impl<'i> FacetsUpdateBulk<'i> {
@@ -39,7 +38,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 index: &'i Index,
 field_ids: Vec<FieldId>,
 facet_type: FacetType,
-new_data: grenad::Reader<BufReader<File>>,
+new_data: grenad::Reader<File>,
 group_size: u8,
 min_level_size: u8,
 ) -> FacetsUpdateBulk<'i> {
@@ -188,7 +187,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 &self,
 field_id: FieldId,
 txn: &RoTxn,
-) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
+) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
 let mut all_docids = RoaringBitmap::new();
 let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
 for bitmap in bitmaps {
@@ -260,7 +259,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 field_id: u16,
 level: u8,
 handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
-) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
+) -> Result<Vec<grenad::Reader<File>>> {
 if level == 0 {
 self.read_level_0(rtxn, field_id, handle_group)?;
 // Level 0 is already in the database

View File

@@ -1,6 +1,5 @@
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::BufReader;
 use heed::types::{ByteSlice, DecodeIgnore};
 use heed::{BytesDecode, Error, RoTxn, RwTxn};
@@ -35,14 +34,14 @@ pub struct FacetsUpdateIncremental<'i> {
 index: &'i Index,
 inner: FacetsUpdateIncrementalInner,
 facet_type: FacetType,
-new_data: grenad::Reader<BufReader<File>>,
+new_data: grenad::Reader<File>,
 }
 impl<'i> FacetsUpdateIncremental<'i> {
 pub fn new(
 index: &'i Index,
 facet_type: FacetType,
-new_data: grenad::Reader<BufReader<File>>,
+new_data: grenad::Reader<File>,
 group_size: u8,
 min_level_size: u8,
 max_group_size: u8,

View File

@@ -78,7 +78,6 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
 use std::collections::BTreeSet;
 use std::fs::File;
-use std::io::BufReader;
 use std::iter::FromIterator;
 use charabia::normalizer::{Normalize, NormalizerOption};
@@ -95,7 +94,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
 use crate::heed_codec::ByteSliceRefCodec;
 use crate::update::index_documents::create_sorter;
 use crate::update::merge_btreeset_string;
-use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
+use crate::{BEU16StrCodec, Index, Result, BEU16};
 pub mod bulk;
 pub mod delete;
@@ -109,17 +108,13 @@ pub struct FacetsUpdate<'i> {
 index: &'i Index,
 database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
 facet_type: FacetType,
-new_data: grenad::Reader<BufReader<File>>,
+new_data: grenad::Reader<File>,
 group_size: u8,
 max_group_size: u8,
 min_level_size: u8,
 }
 impl<'i> FacetsUpdate<'i> {
-pub fn new(
-index: &'i Index,
-facet_type: FacetType,
-new_data: grenad::Reader<BufReader<File>>,
-) -> Self {
+pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
 let database = match facet_type {
 FacetType::String => index
 .facet_id_string_docids
@@ -196,16 +191,7 @@ impl<'i> FacetsUpdate<'i> {
 for result in database.iter(wtxn)? {
 let (facet_group_key, ()) = result?;
 if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-let mut normalized_facet = left_bound.normalize(&options);
-let normalized_truncated_facet: String;
-if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
-normalized_truncated_facet = normalized_facet
-.char_indices()
-.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-.map(|(_, c)| c)
-.collect();
-normalized_facet = normalized_truncated_facet.into();
-}
+let normalized_facet = left_bound.normalize(&options);
 let set = BTreeSet::from_iter(std::iter::once(left_bound));
 let key = (field_id, normalized_facet.as_ref());
 let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;

View File

@@ -1,4 +1,4 @@
-use std::io::{BufWriter, Read, Seek};
+use std::io::{Read, Seek};
 use std::result::Result as StdResult;
 use std::{fmt, iter};
@@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
 let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
-let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
+let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
 let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
 // The primary key *field id* that has already been set for this index or the one
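
The left-hand side interposes a `BufWriter` between the temporary file and `grenad::Writer`, so many small key writes are batched before reaching the OS. The same pattern in isolation (error handling simplified, function name illustrative):

use std::fs::File;
use std::io::BufWriter;

fn buffered_external_ids() -> std::io::Result<grenad::Writer<BufWriter<File>>> {
    // BufWriter batches grenad's many small writes into fewer syscalls.
    tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)
}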

View File

@@ -1,7 +1,6 @@
 use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
-use std::io::BufReader;
 use std::{io, mem, str};
 use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
@@ -29,10 +28,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 indexer: GrenadParameters,
 searchable_fields: &Option<HashSet<FieldId>>,
 stop_words: Option<&fst::Set<&[u8]>>,
-allowed_separators: Option<&[&str]>,
-dictionary: Option<&[&str]>,
 max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
 puffin::profile_function!();
 let max_positions_per_attributes = max_positions_per_attributes
@@ -55,12 +52,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 if let Some(stop_words) = stop_words {
 tokenizer_builder.stop_words(stop_words);
 }
-if let Some(dictionary) = dictionary {
-tokenizer_builder.words_dict(dictionary);
-}
-if let Some(separators) = allowed_separators {
-tokenizer_builder.separators(separators);
-}
 let tokenizer = tokenizer_builder.build();
 let mut cursor = obkv_documents.into_cursor()?;
@@ -227,9 +218,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
 tokens
 .skip_while(|token| token.is_separator())
-.scan((0, None), |(offset, prev_kind), mut token| {
+.scan((0, None), |(offset, prev_kind), token| {
 match token.kind {
-TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
+TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
 *offset += match *prev_kind {
 Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
 Some(_) => 1,
@@ -245,7 +236,7 @@ fn process_tokens<'a>(
 {
 *prev_kind = Some(token.kind);
 }
-_ => token.kind = TokenKind::Unknown,
+_ => (),
 }
 Some((*offset, token))
 })
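
`process_tokens` spaces word positions out as it scans: +1 between adjacent words, +8 across a hard separator such as `.` or `!`, so proximity ranking treats words from different sentences as far apart. A rough standalone sketch of that bookkeeping (token model simplified; charabia's real tokens carry more state):

#[derive(Clone, Copy, PartialEq)]
enum Kind { Word, HardSep, SoftSep }

fn word_positions(tokens: &[Kind]) -> Vec<usize> {
    let mut out = Vec::new();
    let (mut offset, mut prev) = (0usize, None::<Kind>);
    for &kind in tokens {
        match kind {
            Kind::Word => {
                // +8 across a hard separator, +1 otherwise.
                offset += match prev {
                    Some(Kind::HardSep) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                out.push(offset);
                prev = Some(kind);
            }
            // Assumption: a soft separator does not erase a preceding hard one.
            Kind::SoftSep if prev == Some(Kind::HardSep) => {}
            _ => prev = Some(kind),
        }
    }
    out
}

For example, `word_positions(&[Kind::Word, Kind::HardSep, Kind::Word])` yields `[0, 8]`.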

View File

@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use heed::{BytesDecode, BytesEncode};
@@ -19,7 +19,7 @@ use crate::Result;
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
 docid_fid_facet_number: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use heed::BytesEncode;
@@ -17,7 +17,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 docid_fid_facet_string: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();
@@ -46,7 +46,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
 normalised_truncated_value = normalised_value
 .char_indices()
-.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
 .map(|(_, c)| c)
 .collect();
 normalised_value = normalised_truncated_value.as_str();
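
Both sides truncate an over-long normalized facet so the LMDB key stays within `MAX_FACET_VALUE_LENGTH`, and both walk `char_indices` so the cut cannot land inside a multi-byte UTF-8 character; the `idx + 4` bound additionally reserves worst-case headroom for the widest possible char. A hedged sketch of the same guard, written with each char's exact byte length instead of either fixed bound:

// Keep every char whose full encoding still fits in `max` bytes.
fn truncate_on_char_boundary(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }
    let mut end = 0;
    for (idx, c) in s.char_indices() {
        if idx + c.len_utf8() > max {
            break;
        }
        end = idx + c.len_utf8();
    }
    &s[..end]
}

// e.g. truncate_on_char_boundary("héllo", 2) == "h": "é" is 2 bytes and would overflow.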

View File

@@ -1,7 +1,7 @@
 use std::collections::{BTreeMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use std::mem::size_of;
 use heed::zerocopy::AsBytes;
@@ -17,24 +17,22 @@ use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET
 /// The extracted facet values stored in grenad files by type.
 pub struct ExtractedFacetValues {
-pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
-pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
-pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
-pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
-pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
+pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
+pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
+pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
+pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
+pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
 }
 /// Extracts the facet values of each faceted field of each document.
 ///
 /// Returns the generated grenad reader containing the docid the fid and the orginal value as key
 /// and the normalized value as value extracted from the given chunk of documents.
-/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
 #[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 obkv_documents: grenad::Reader<R>,
 indexer: GrenadParameters,
 faceted_fields: &HashSet<FieldId>,
-geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
 puffin::profile_function!();
@@ -86,10 +84,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
-match extract_facet_values(
-&value,
-geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
-) {
+match extract_facet_values(&value) {
 FilterableValues::Null => {
 facet_is_null_docids.entry(field_id).or_default().insert(document);
 }
@@ -182,13 +177,12 @@ enum FilterableValues {
 Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
 }
-fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
+fn extract_facet_values(value: &Value) -> FilterableValues {
 fn inner_extract_facet_values(
 value: &Value,
 can_recurse: bool,
 output_numbers: &mut Vec<f64>,
 output_strings: &mut Vec<(String, String)>,
-geo_field: bool,
 ) {
 match value {
 Value::Null => (),
@@ -199,30 +193,13 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
 }
 }
 Value::String(original) => {
-// if we're working on a geofield it MUST be something we can parse or else there was an internal error
-// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
-if geo_field {
-if let Ok(float) = original.parse() {
-output_numbers.push(float);
-} else {
-log::warn!(
-"Internal error, could not parse a geofield that has been validated. Please open an issue."
-)
-}
-}
 let normalized = crate::normalize_facet(original);
 output_strings.push((normalized, original.clone()));
 }
 Value::Array(values) => {
 if can_recurse {
 for value in values {
-inner_extract_facet_values(
-value,
-false,
-output_numbers,
-output_strings,
-geo_field,
-);
+inner_extract_facet_values(value, false, output_numbers, output_strings);
 }
 }
 }
@@ -238,7 +215,7 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
 otherwise => {
 let mut numbers = Vec::new();
 let mut strings = Vec::new();
-inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
+inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
 FilterableValues::Values { numbers, strings }
 }
 }
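
The removed branch lets `_geo.lat` and `_geo.lng` arrive as strings: values for those two field ids are parsed into `f64` facets, with a warning rather than a crash if a string that passed enrichment somehow fails to parse. A minimal hedged sketch of that fallback (function name is illustrative):

// Treat a string-typed geo coordinate as a number when possible.
fn push_geo_string(original: &str, output_numbers: &mut Vec<f64>) {
    if let Ok(float) = original.parse::<f64>() {
        output_numbers.push(float);
    } else {
        // The enrich pipeline already validated this value, so reaching this
        // arm would indicate an internal bug; log instead of panicking.
        log::warn!("could not parse a geofield that has been validated");
    }
}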

View File

@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use grenad::Sorter;
@@ -21,7 +21,7 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
 pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
 docid_word_positions: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use concat_arrays::concat_arrays;
 use serde_json::Value;
@@ -18,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
 indexer: GrenadParameters,
 primary_key_id: FieldId,
 (lat_fid, lng_fid): (FieldId, FieldId),
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let mut writer = create_writer(

View File

@@ -1,6 +1,6 @@
 use std::convert::TryFrom;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use bytemuck::cast_slice;
 use serde_json::{from_slice, Value};
@@ -18,7 +18,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 indexer: GrenadParameters,
 primary_key_id: FieldId,
 vectors_fid: FieldId,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let mut writer = create_writer(
@@ -35,7 +35,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 // lazily get it when needed
 let document_id = || -> Value {
 let document_id = obkv.get(primary_key_id).unwrap();
-from_slice(document_id).unwrap()
+serde_json::from_slice(document_id).unwrap()
 };
 // first we retrieve the _vectors field
@@ -52,7 +52,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 }
 };
-if let Some(vectors) = vectors {
 for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
 let index = u16::try_from(i).unwrap();
 let mut key = docid_bytes.to_vec();
@@ -61,7 +60,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 writer.insert(key, bytes)?;
 }
 }
-}
 // else => the `_vectors` object was `null`, there is nothing to do
 }

View File

@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use std::iter::FromIterator;
 use roaring::RoaringBitmap;
@@ -26,7 +26,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 docid_word_positions: grenad::Reader<R>,
 indexer: GrenadParameters,
 exact_attributes: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use super::helpers::{
 create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
@@ -14,7 +14,7 @@ use crate::{relative_from_absolute_position, DocumentId, Result};
 pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
 docid_word_positions: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -1,7 +1,6 @@
 use std::cmp::Ordering;
 use std::collections::{BinaryHeap, HashMap};
 use std::fs::File;
-use std::io::BufReader;
 use std::{cmp, io, mem, str, vec};
 use super::helpers::{
@@ -21,7 +20,7 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 docid_word_positions: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;
 use super::helpers::{
 create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
@@ -17,7 +17,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
 pub fn extract_word_position_docids<R: io::Read + io::Seek>(
 docid_word_positions: grenad::Reader<R>,
 indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
 puffin::profile_function!();
 let max_memory = indexer.max_memory_by_thread();

View File

@@ -12,7 +12,6 @@ mod extract_word_position_docids;
 use std::collections::HashSet;
 use std::fs::File;
-use std::io::BufReader;
 use crossbeam_channel::Sender;
 use log::debug;
@@ -40,8 +39,8 @@ use crate::{FieldId, Result};
 /// Send data in grenad file over provided Sender.
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn data_from_obkv_documents(
-original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
-flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
+original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
+flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
 indexer: GrenadParameters,
 lmdb_writer_sx: Sender<Result<TypedChunk>>,
 searchable_fields: Option<HashSet<FieldId>>,
@@ -50,8 +49,6 @@ pub(crate) fn data_from_obkv_documents(
 geo_fields_ids: Option<(FieldId, FieldId)>,
 vectors_field_id: Option<FieldId>,
 stop_words: Option<fst::Set<&[u8]>>,
-allowed_separators: Option<&[&str]>,
-dictionary: Option<&[&str]>,
 max_positions_per_attributes: Option<u32>,
 exact_attributes: HashSet<FieldId>,
 ) -> Result<()> {
@@ -60,13 +57,7 @@ pub(crate) fn data_from_obkv_documents(
 original_obkv_chunks
 .par_bridge()
 .map(|original_documents_chunk| {
-send_original_documents_data(
-original_documents_chunk,
-indexer,
-lmdb_writer_sx.clone(),
-vectors_field_id,
-primary_key_id,
-)
+send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
 })
 .collect::<Result<()>>()?;
@@ -83,9 +74,8 @@ pub(crate) fn data_from_obkv_documents(
 &faceted_fields,
 primary_key_id,
 geo_fields_ids,
-vectors_field_id,
 &stop_words,
-&allowed_separators,
-&dictionary,
 max_positions_per_attributes,
 )
 })
@@ -153,7 +143,7 @@ pub(crate) fn data_from_obkv_documents(
 });
 }
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_word_positions_chunks.clone(),
 indexer,
 lmdb_writer_sx.clone(),
@@ -163,7 +153,7 @@ pub(crate) fn data_from_obkv_documents(
 "word-pair-proximity-docids",
 );
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_word_positions_chunks.clone(),
 indexer,
 lmdb_writer_sx.clone(),
@@ -173,11 +163,7 @@ pub(crate) fn data_from_obkv_documents(
 "field-id-wordcount-docids",
 );
-spawn_extraction_task::<
-_,
-_,
-Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
->(
+spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
 docid_word_positions_chunks.clone(),
 indexer,
 lmdb_writer_sx.clone(),
@@ -190,7 +176,7 @@ pub(crate) fn data_from_obkv_documents(
 "word-docids",
 );
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_word_positions_chunks.clone(),
 indexer,
 lmdb_writer_sx.clone(),
@@ -199,7 +185,7 @@ pub(crate) fn data_from_obkv_documents(
 TypedChunk::WordPositionDocids,
 "word-position-docids",
 );
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_word_positions_chunks,
 indexer,
 lmdb_writer_sx.clone(),
@@ -209,7 +195,7 @@ pub(crate) fn data_from_obkv_documents(
 "word-fid-docids",
 );
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_fid_facet_strings_chunks,
 indexer,
 lmdb_writer_sx.clone(),
@@ -219,7 +205,7 @@ pub(crate) fn data_from_obkv_documents(
 "field-id-facet-string-docids",
 );
-spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 docid_fid_facet_numbers_chunks,
 indexer,
 lmdb_writer_sx,
@@ -274,34 +260,12 @@ fn spawn_extraction_task<FE, FS, M>(
/// Extract chunked data and send it into lmdb_writer_sx sender: /// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents /// - documents
fn send_original_documents_data( fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, original_documents_chunk: Result<grenad::Reader<File>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
vectors_field_id: Option<FieldId>,
primary_key_id: FieldId,
) -> Result<()> { ) -> Result<()> {
let original_documents_chunk = let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
if let Some(vectors_field_id) = vectors_field_id {
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(
documents_chunk_cloned,
indexer,
primary_key_id,
vectors_field_id,
);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
}
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
// TODO: create a custom internal error // TODO: create a custom internal error
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
Ok(()) Ok(())
@@ -316,16 +280,15 @@ fn send_original_documents_data(
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
fn send_and_extract_flattened_documents_data( fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>, flattened_documents_chunk: Result<grenad::Reader<File>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: &Option<HashSet<FieldId>>, searchable_fields: &Option<HashSet<FieldId>>,
faceted_fields: &HashSet<FieldId>, faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
vectors_field_id: Option<FieldId>,
stop_words: &Option<fst::Set<&[u8]>>, stop_words: &Option<fst::Set<&[u8]>>,
allowed_separators: &Option<&[&str]>,
dictionary: &Option<&[&str]>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<( ) -> Result<(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
@@ -333,10 +296,7 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
( (
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
( (grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
grenad::Reader<BufReader<File>>,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
),
), ),
), ),
)> { )> {
@@ -356,6 +316,25 @@ fn send_and_extract_flattened_documents_data(
}); });
} }
if let Some(vectors_field_id) = vectors_field_id {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(
documents_chunk_cloned,
indexer,
primary_key_id,
vectors_field_id,
);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
}
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join( rayon::join(
|| { || {
@@ -365,8 +344,6 @@ fn send_and_extract_flattened_documents_data(
indexer, indexer,
searchable_fields, searchable_fields,
stop_words.as_ref(), stop_words.as_ref(),
*allowed_separators,
*dictionary,
max_positions_per_attributes, max_positions_per_attributes,
)?; )?;
@@ -393,7 +370,6 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
faceted_fields, faceted_fields,
geo_fields_ids,
)?; )?;
// send docid_fid_facet_numbers_chunk to DB writer // send docid_fid_facet_numbers_chunk to DB writer
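The two big hunks above relocate the `_vectors` extraction between `send_original_documents_data` and `send_and_extract_flattened_documents_data`, but the concurrency pattern is identical on both sides: spawn the extraction off-thread and forward either the typed chunk or the error down the writer channel. Below is a minimal std-only sketch of that fan-out, using `std::thread` and `mpsc` in place of rayon and crossbeam; all names are stand-ins, not the crate's API:

```rust
use std::sync::mpsc;
use std::thread;

// Each extraction job runs on its own worker and forwards either its
// result or its error over a channel, so a single writer loop can drain
// everything from one receiver, as the LMDB writer does above.
fn main() {
    let (tx, rx) = mpsc::channel::<Result<String, String>>();

    for chunk in ["chunk-1", "chunk-2", "chunk-3"] {
        let tx = tx.clone();
        thread::spawn(move || {
            // Stand-in for `extract_vector_points` / the other extractors.
            let result: Result<String, String> = Ok(format!("extracted {chunk}"));
            // Mirrors `lmdb_writer_sx_cloned.send(...)`: errors travel on
            // the same channel as successful typed chunks.
            let _ = tx.send(result);
        });
    }
    drop(tx); // close the channel once every worker's sender is gone

    for message in rx {
        match message {
            Ok(typed_chunk) => println!("writer received: {typed_chunk}"),
            Err(error) => eprintln!("writer received error: {error}"),
        }
    }
}
```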

View File

@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::fs::File;
-use std::io::{self, BufReader, BufWriter, Seek};
+use std::io::{self, Seek};
 use std::time::Instant;

 use grenad::{CompressionType, Sorter};
@@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
     typ: grenad::CompressionType,
     level: Option<u32>,
     file: R,
-) -> grenad::Writer<BufWriter<R>> {
+) -> grenad::Writer<R> {
     let mut builder = grenad::Writer::builder();
     builder.compression_type(typ);
     if let Some(level) = level {
         builder.compression_level(level);
     }
-    builder.build(BufWriter::new(file))
+    builder.build(file)
 }

 pub fn create_sorter(
@@ -53,7 +53,7 @@ pub fn create_sorter(
 pub fn sorter_into_reader(
     sorter: grenad::Sorter<MergeFn>,
     indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<File>> {
     let mut writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
@@ -64,18 +64,16 @@ pub fn sorter_into_reader(
     writer_into_reader(writer)
 }

-pub fn writer_into_reader(
-    writer: grenad::Writer<BufWriter<File>>,
-) -> Result<grenad::Reader<BufReader<File>>> {
-    let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
+pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
+    let mut file = writer.into_inner()?;
     file.rewind()?;
-    grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
+    grenad::Reader::new(file).map_err(Into::into)
 }

 pub unsafe fn as_cloneable_grenad(
-    reader: &grenad::Reader<BufReader<File>>,
+    reader: &grenad::Reader<File>,
 ) -> Result<grenad::Reader<CursorClonableMmap>> {
-    let file = reader.get_ref().get_ref();
+    let file = reader.get_ref();
     let mmap = memmap2::Mmap::map(file)?;
     let cursor = io::Cursor::new(ClonableMmap::from(mmap));
     let reader = grenad::Reader::new(cursor)?;
@@ -91,8 +89,8 @@ where
     fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
 }

-impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
-    type Output = grenad::Reader<BufReader<File>>;
+impl MergeableReader for Vec<grenad::Reader<File>> {
+    type Output = grenad::Reader<File>;

     fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
         let mut merger = MergerBuilder::new(merge_fn);
@@ -101,8 +99,8 @@ impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
     }
 }

-impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
-    type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
+impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
+    type Output = (grenad::Reader<File>, grenad::Reader<File>);

     fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
         let mut m1 = MergerBuilder::new(merge_fn);
@@ -127,7 +125,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
         Ok(())
     }

-    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
+    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
         let merger = self.0.build();
         let mut writer = create_writer(
             params.chunk_compression_type,
@@ -178,7 +176,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
     reader: grenad::Reader<R>,
     indexer: GrenadParameters,
     documents_chunk_size: usize,
-) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
+) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
     let mut continue_reading = true;
     let mut cursor = reader.into_cursor()?;
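The buffering being stripped in this file follows a standard std pattern: wrap the `File` in a `BufWriter` while writing, unwrap it with `into_inner()` (which flushes and can fail, hence the `map_err(|err| err.into_error())` in the diff), rewind, and re-read through a `BufReader`. A self-contained sketch of just that round trip, independent of grenad (the file name is hypothetical):

```rust
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Seek, Write};

fn main() -> std::io::Result<()> {
    let file = File::options()
        .read(true)
        .write(true)
        .create(true)
        .truncate(true)
        .open("example.grenad")?; // hypothetical scratch file

    // Buffered writes, as `create_writer` does with `BufWriter::new(file)`.
    let mut writer = BufWriter::new(file);
    writer.write_all(b"some sorted entries")?;

    // Equivalent of `writer.into_inner()?.into_inner().map_err(|err| err.into_error())?`:
    // flush the buffer and recover the underlying File, surfacing any I/O error.
    let mut file = writer.into_inner().map_err(|err| err.into_error())?;
    file.rewind()?;

    // Buffered reads, as `writer_into_reader` does with `BufReader::new(file)`.
    let mut reader = BufReader::new(file);
    let mut contents = String::new();
    reader.read_to_string(&mut contents)?;
    println!("{contents}");
    Ok(())
}
```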

View File

@@ -316,12 +316,6 @@ where
         let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");

         let stop_words = self.index.stop_words(self.wtxn)?;
-        let separators = self.index.allowed_separators(self.wtxn)?;
-        let separators: Option<Vec<_>> =
-            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
-        let dictionary = self.index.dictionary(self.wtxn)?;
-        let dictionary: Option<Vec<_>> =
-            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;

         let pool_params = GrenadParameters {
@@ -359,8 +353,6 @@ where
                     geo_fields_ids,
                     vectors_field_id,
                     stop_words,
-                    separators.as_deref(),
-                    dictionary.as_deref(),
                     max_positions_per_attributes,
                     exact_attributes,
                 )
@@ -2550,25 +2542,6 @@ mod tests {
         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
     }

-    /// Index multiple different number of vectors in documents.
-    /// Vectors must be of the same length.
-    #[test]
-    fn test_multiple_vectors() {
-        let index = TempIndex::new();
-
-        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
-        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
-        index
-            .add_documents(
-                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
-            )
-            .unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
-        assert_eq!(res.documents_ids.len(), 3);
-    }
-
     #[test]
     fn reproduce_the_bug() {
         /*

View File

@@ -659,10 +659,8 @@ impl<'a, 'i> Transform<'a, 'i> {
             new_documents_ids: self.new_documents_ids,
             replaced_documents_ids: self.replaced_documents_ids,
             documents_count: self.documents_count,
-            original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
-            flattened_documents: flattened_documents
-                .into_inner()
-                .map_err(|err| err.into_error())?,
+            original_documents,
+            flattened_documents,
         })
     }
@@ -781,10 +779,8 @@ impl<'a, 'i> Transform<'a, 'i> {
             new_documents_ids: documents_ids,
             replaced_documents_ids: RoaringBitmap::default(),
             documents_count,
-            original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
-            flattened_documents: flattened_documents
-                .into_inner()
-                .map_err(|err| err.into_error())?,
+            original_documents,
+            flattened_documents,
         };

         let new_facets = output.compute_real_facets(wtxn, self.index)?;

View File

@@ -2,7 +2,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io;

 use bytemuck::allocation::pod_collect_to_vec;
 use charabia::{Language, Script};
@@ -27,22 +27,22 @@ pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
     Documents(grenad::Reader<CursorClonableMmap>),
-    FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
+    FieldIdWordcountDocids(grenad::Reader<File>),
     NewDocumentsIds(RoaringBitmap),
     WordDocids {
-        word_docids_reader: grenad::Reader<BufReader<File>>,
-        exact_word_docids_reader: grenad::Reader<BufReader<File>>,
+        word_docids_reader: grenad::Reader<File>,
+        exact_word_docids_reader: grenad::Reader<File>,
     },
-    WordPositionDocids(grenad::Reader<BufReader<File>>),
-    WordFidDocids(grenad::Reader<BufReader<File>>),
-    WordPairProximityDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
-    FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
-    GeoPoints(grenad::Reader<BufReader<File>>),
-    VectorPoints(grenad::Reader<BufReader<File>>),
+    WordPositionDocids(grenad::Reader<File>),
+    WordFidDocids(grenad::Reader<File>),
+    WordPairProximityDocids(grenad::Reader<File>),
+    FieldIdFacetStringDocids(grenad::Reader<File>),
+    FieldIdFacetNumberDocids(grenad::Reader<File>),
+    FieldIdFacetExistsDocids(grenad::Reader<File>),
+    FieldIdFacetIsNullDocids(grenad::Reader<File>),
+    FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
+    GeoPoints(grenad::Reader<File>),
+    VectorPoints(grenad::Reader<File>),
     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }

View File

@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::collections::HashSet;
-use std::io::{BufReader, BufWriter};
+use std::io::BufReader;

 use grenad::CompressionType;
 use heed::types::ByteSlice;
@@ -119,9 +119,9 @@ pub fn insert_into_database(
 pub fn write_into_lmdb_database_without_merging(
     wtxn: &mut heed::RwTxn,
     database: heed::PolyDatabase,
-    writer: grenad::Writer<BufWriter<std::fs::File>>,
+    writer: grenad::Writer<std::fs::File>,
 ) -> Result<()> {
-    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
+    let file = writer.into_inner()?;
     let reader = grenad::Reader::new(BufReader::new(file))?;
     if database.is_empty(wtxn)? {
         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;

View File

@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;

 use charabia::{Normalize, Tokenizer, TokenizerBuilder};
@@ -112,11 +112,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
     sortable_fields: Setting<HashSet<String>>,
     criteria: Setting<Vec<Criterion>>,
     stop_words: Setting<BTreeSet<String>>,
-    non_separator_tokens: Setting<BTreeSet<String>>,
-    separator_tokens: Setting<BTreeSet<String>>,
-    dictionary: Setting<BTreeSet<String>>,
     distinct_field: Setting<String>,
-    synonyms: Setting<BTreeMap<String, Vec<String>>>,
+    synonyms: Setting<HashMap<String, Vec<String>>>,
     primary_key: Setting<String>,
     authorize_typos: Setting<bool>,
     min_word_len_two_typos: Setting<u8>,
@@ -144,9 +141,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             sortable_fields: Setting::NotSet,
             criteria: Setting::NotSet,
             stop_words: Setting::NotSet,
-            non_separator_tokens: Setting::NotSet,
-            separator_tokens: Setting::NotSet,
-            dictionary: Setting::NotSet,
             distinct_field: Setting::NotSet,
             synonyms: Setting::NotSet,
             primary_key: Setting::NotSet,
@@ -211,39 +205,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
     }

-    pub fn reset_non_separator_tokens(&mut self) {
-        self.non_separator_tokens = Setting::Reset;
-    }
-
-    pub fn set_non_separator_tokens(&mut self, non_separator_tokens: BTreeSet<String>) {
-        self.non_separator_tokens = if non_separator_tokens.is_empty() {
-            Setting::Reset
-        } else {
-            Setting::Set(non_separator_tokens)
-        }
-    }
-
-    pub fn reset_separator_tokens(&mut self) {
-        self.separator_tokens = Setting::Reset;
-    }
-
-    pub fn set_separator_tokens(&mut self, separator_tokens: BTreeSet<String>) {
-        self.separator_tokens = if separator_tokens.is_empty() {
-            Setting::Reset
-        } else {
-            Setting::Set(separator_tokens)
-        }
-    }
-
-    pub fn reset_dictionary(&mut self) {
-        self.dictionary = Setting::Reset;
-    }
-
-    pub fn set_dictionary(&mut self, dictionary: BTreeSet<String>) {
-        self.dictionary =
-            if dictionary.is_empty() { Setting::Reset } else { Setting::Set(dictionary) }
-    }
-
     pub fn reset_distinct_field(&mut self) {
         self.distinct_field = Setting::Reset;
     }
@@ -256,7 +217,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.synonyms = Setting::Reset;
     }

-    pub fn set_synonyms(&mut self, synonyms: BTreeMap<String, Vec<String>>) {
+    pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
         self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
     }
@@ -491,89 +452,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         }
     }

-    fn update_non_separator_tokens(&mut self) -> Result<bool> {
-        let changes = match self.non_separator_tokens {
-            Setting::Set(ref non_separator_tokens) => {
-                let current = self.index.non_separator_tokens(self.wtxn)?;
-
-                // Does the new list differ from the previous one?
-                if current.map_or(true, |current| &current != non_separator_tokens) {
-                    self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?;
-                    true
-                } else {
-                    false
-                }
-            }
-            Setting::Reset => self.index.delete_non_separator_tokens(self.wtxn)?,
-            Setting::NotSet => false,
-        };
-
-        // the synonyms must be updated if non separator tokens have been updated.
-        if changes && self.synonyms == Setting::NotSet {
-            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
-        }
-
-        Ok(changes)
-    }
-
-    fn update_separator_tokens(&mut self) -> Result<bool> {
-        let changes = match self.separator_tokens {
-            Setting::Set(ref separator_tokens) => {
-                let current = self.index.separator_tokens(self.wtxn)?;
-
-                // Does the new list differ from the previous one?
-                if current.map_or(true, |current| &current != separator_tokens) {
-                    self.index.put_separator_tokens(self.wtxn, separator_tokens)?;
-                    true
-                } else {
-                    false
-                }
-            }
-            Setting::Reset => self.index.delete_separator_tokens(self.wtxn)?,
-            Setting::NotSet => false,
-        };
-
-        // the synonyms must be updated if separator tokens have been updated.
-        if changes && self.synonyms == Setting::NotSet {
-            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
-        }
-
-        Ok(changes)
-    }
-
-    fn update_dictionary(&mut self) -> Result<bool> {
-        let changes = match self.dictionary {
-            Setting::Set(ref dictionary) => {
-                let current = self.index.dictionary(self.wtxn)?;
-
-                // Does the new list differ from the previous one?
-                if current.map_or(true, |current| &current != dictionary) {
-                    self.index.put_dictionary(self.wtxn, dictionary)?;
-                    true
-                } else {
-                    false
-                }
-            }
-            Setting::Reset => self.index.delete_dictionary(self.wtxn)?,
-            Setting::NotSet => false,
-        };
-
-        // the synonyms must be updated if dictionary has been updated.
-        if changes && self.synonyms == Setting::NotSet {
-            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
-        }
-
-        Ok(changes)
-    }
-
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
-            Setting::Set(ref user_synonyms) => {
+            Setting::Set(ref synonyms) => {
                 fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
-                            if token.is_word() && !token.lemma().is_empty() {
+                            if token.is_word() {
                                 Some(token.lemma().to_string())
                             } else {
                                 None
@@ -587,39 +473,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 if let Some(ref stop_words) = stop_words {
                     builder.stop_words(stop_words);
                 }
-
-                let separators = self.index.allowed_separators(self.wtxn)?;
-                let separators: Option<Vec<_>> =
-                    separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
-                if let Some(ref separators) = separators {
-                    builder.separators(separators);
-                }
-
-                let dictionary = self.index.dictionary(self.wtxn)?;
-                let dictionary: Option<Vec<_>> =
-                    dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
-                if let Some(ref dictionary) = dictionary {
-                    builder.words_dict(dictionary);
-                }
-
                 let tokenizer = builder.build();

                 let mut new_synonyms = HashMap::new();
-                for (word, synonyms) in user_synonyms {
+                for (word, synonyms) in synonyms {
                     // Normalize both the word and associated synonyms.
                     let normalized_word = normalize(&tokenizer, word);
-                    let normalized_synonyms: Vec<_> = synonyms
-                        .iter()
-                        .map(|synonym| normalize(&tokenizer, synonym))
-                        .filter(|synonym| !synonym.is_empty())
-                        .collect();
+                    let normalized_synonyms =
+                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));

                     // Store the normalized synonyms under the normalized word,
                     // merging the possible duplicate words.
-                    if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
-                        let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                        entry.extend(normalized_synonyms.into_iter());
-                    }
+                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms);
                 }

                 // Make sure that we don't have duplicate synonyms.
@@ -631,7 +497,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 let old_synonyms = self.index.synonyms(self.wtxn)?;

                 if new_synonyms != old_synonyms {
-                    self.index.put_synonyms(self.wtxn, &new_synonyms, user_synonyms)?;
+                    self.index.put_synonyms(self.wtxn, &new_synonyms)?;
                     Ok(true)
                 } else {
                     Ok(false)
@@ -891,17 +757,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         let faceted_updated = old_faceted_fields != new_faceted_fields;

         let stop_words_updated = self.update_stop_words()?;
-        let non_separator_tokens_updated = self.update_non_separator_tokens()?;
-        let separator_tokens_updated = self.update_separator_tokens()?;
-        let dictionary_updated = self.update_dictionary()?;
         let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
         let exact_attributes_updated = self.update_exact_attributes()?;

         if stop_words_updated
-            || non_separator_tokens_updated
-            || separator_tokens_updated
-            || dictionary_updated
             || faceted_updated
             || synonyms_updated
             || searchable_updated
@@ -918,7 +778,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 mod tests {
     use big_s::S;
     use heed::types::ByteSlice;
-    use maplit::{btreemap, btreeset, hashset};
+    use maplit::{btreeset, hashmap, hashset};

     use super::*;
     use crate::error::Error;
@@ -1384,7 +1244,7 @@ mod tests {
     // In the same transaction provide some synonyms
     index
         .update_settings_using_wtxn(&mut wtxn, |settings| {
-            settings.set_synonyms(btreemap! {
+            settings.set_synonyms(hashmap! {
                 "blini".to_string() => vec!["crepes".to_string()],
                 "super like".to_string() => vec!["love".to_string()],
                 "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
@@ -1427,43 +1287,6 @@ mod tests {
         assert!(result.documents_ids.is_empty());
     }

-    #[test]
-    fn thai_synonyms() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.autogenerate_docids = true;
-
-        let mut wtxn = index.write_txn().unwrap();
-        // Send 3 documents with ids from 1 to 3.
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "name": "ยี่ปุ่น" },
-                    { "name": "ญี่ปุ่น" },
-                ]),
-            )
-            .unwrap();
-
-        // In the same transaction provide some synonyms
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_synonyms(btreemap! {
-                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
-                });
-            })
-            .unwrap();
-        wtxn.commit().unwrap();
-
-        // Ensure synonyms are effectively stored
-        let rtxn = index.read_txn().unwrap();
-        let synonyms = index.synonyms(&rtxn).unwrap();
-        assert!(!synonyms.is_empty()); // at this point the index should return something
-
-        // Check that we can use synonyms
-        let result = index.search(&rtxn).query("japanese").execute().unwrap();
-        assert_eq!(result.documents_ids.len(), 2);
-    }
-
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let index = TempIndex::new();
@@ -1717,9 +1540,6 @@ mod tests {
             sortable_fields,
             criteria,
             stop_words,
-            non_separator_tokens,
-            separator_tokens,
-            dictionary,
             distinct_field,
             synonyms,
             primary_key,
@@ -1738,9 +1558,6 @@ mod tests {
         assert!(matches!(sortable_fields, Setting::NotSet));
         assert!(matches!(criteria, Setting::NotSet));
         assert!(matches!(stop_words, Setting::NotSet));
-        assert!(matches!(non_separator_tokens, Setting::NotSet));
-        assert!(matches!(separator_tokens, Setting::NotSet));
-        assert!(matches!(dictionary, Setting::NotSet));
        assert!(matches!(distinct_field, Setting::NotSet));
         assert!(matches!(synonyms, Setting::NotSet));
         assert!(matches!(primary_key, Setting::NotSet));
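For context on the synonym hunks above: both sides normalize every user-provided word and synonym through the tokenizer before storing them, and the left-hand side additionally drops entries that normalize to nothing, which is what the `!normalized_word.is_empty() && !normalized_synonyms.is_empty()` guard is for. A rough sketch of that filtering step, with plain lowercase/whitespace splitting standing in for charabia's tokenizer (all names here are illustrative, not the crate's API):

```rust
use std::collections::HashMap;

// Stand-in for `normalize(&tokenizer, text)`: lowercase, split on
// whitespace, and drop empty lemmas.
fn normalize(text: &str) -> Vec<String> {
    text.split_whitespace()
        .map(|word| word.to_lowercase())
        .filter(|word| !word.is_empty())
        .collect()
}

fn main() {
    let user_synonyms = HashMap::from([
        ("Blini".to_string(), vec!["Crepes".to_string()]),
        ("  ".to_string(), vec!["ignored".to_string()]), // normalizes to nothing
    ]);

    let mut new_synonyms: HashMap<Vec<String>, Vec<Vec<String>>> = HashMap::new();
    for (word, synonyms) in &user_synonyms {
        let normalized_word = normalize(word);
        let normalized_synonyms: Vec<_> =
            synonyms.iter().map(|s| normalize(s)).filter(|s| !s.is_empty()).collect();
        // Skip entries that normalize away entirely, as the left-hand side does.
        if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
            new_synonyms.entry(normalized_word).or_default().extend(normalized_synonyms);
        }
    }
    println!("{new_synonyms:?}"); // only the "blini" => "crepes" entry survives
}
```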

View File

@@ -8,7 +8,7 @@ use Criterion::*;
 use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};

 macro_rules! test_distinct {
-    ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $offset:expr, $criteria:expr, $n_res:expr) => {
+    ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $criteria:expr, $n_res:expr) => {
         #[test]
         fn $func() {
             let criteria = $criteria;
@@ -27,7 +27,6 @@ macro_rules! test_distinct {
             let mut search = Search::new(&rtxn, &index);
             search.query(search::TEST_QUERY);
             search.limit($limit);
-            search.offset($offset);
             search.exhaustive_number_hits($exhaustive);
             search.terms_matching_strategy(TermsMatchingStrategy::default());
@@ -48,7 +47,6 @@ macro_rules! test_distinct {
                         Some(d.id)
                     }
                 })
-                .skip($offset)
                 .take($limit)
                 .collect();
@@ -63,7 +61,6 @@ test_distinct!(
     tag,
     true,
     1,
-    0,
     vec![Words, Typo, Proximity, Attribute, Exactness],
     3
 );
@@ -72,7 +69,6 @@ test_distinct!(
     asc_desc_rank,
     true,
     1,
-    0,
     vec![Words, Typo, Proximity, Attribute, Exactness],
     7
 );
@@ -81,7 +77,6 @@ test_distinct!(
     asc_desc_rank,
     true,
     0,
-    0,
     vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo],
     7
 );
@@ -91,7 +86,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Typo, Proximity, Attribute, Exactness],
     3
 );
@@ -100,7 +94,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Typo, Proximity, Attribute, Exactness],
     7
 );
@@ -109,7 +102,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words],
     3
 );
@@ -118,7 +110,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words],
     7
 );
@@ -127,7 +118,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Typo],
     3
 );
@@ -136,7 +126,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Typo],
     7
 );
@@ -145,7 +134,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Proximity],
     3
 );
@@ -154,7 +142,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Proximity],
     7
 );
@@ -163,7 +150,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Attribute],
     3
 );
@@ -172,7 +158,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Attribute],
     7
 );
@@ -181,7 +166,6 @@ test_distinct!(
     tag,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Exactness],
     3
 );
@@ -190,47 +174,6 @@ test_distinct!(
     asc_desc_rank,
     false,
     EXTERNAL_DOCUMENTS_IDS.len(),
-    0,
     vec![Words, Exactness],
     7
 );
-
-test_distinct!(
-    // testing: https://github.com/meilisearch/meilisearch/issues/4078
-    distinct_string_limit_and_offset,
-    tag,
-    false,
-    EXTERNAL_DOCUMENTS_IDS.len(),
-    1,
-    vec![],
-    3
-);
-test_distinct!(
-    // testing: https://github.com/meilisearch/meilisearch/issues/4078
-    exhaustive_distinct_string_limit_and_offset,
-    tag,
-    true,
-    1,
-    2,
-    vec![],
-    3
-);
-test_distinct!(
-    // testing: https://github.com/meilisearch/meilisearch/issues/4078
-    distinct_number_limit_and_offset,
-    asc_desc_rank,
-    false,
-    EXTERNAL_DOCUMENTS_IDS.len(),
-    2,
-    vec![],
-    7
-);
-test_distinct!(
-    // testing: https://github.com/meilisearch/meilisearch/issues/4078
-    exhaustive_distinct_number_limit_and_offset,
-    asc_desc_rank,
-    true,
-    2,
-    4,
-    vec![],
-    7
-);
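The four removed `*_limit_and_offset` cases exercise meilisearch#4078: `offset` must apply to the deduplicated result list, mirroring the `.skip($offset)` the macro adds on the left-hand side, so pages never repeat or drop a distinct value. A tiny sketch of the property those tests assert (the helper and its data are hypothetical):

```rust
// Deduplicate on the distinct attribute first, then page with skip/take.
fn distinct_page(ids_with_tag: &[(u32, &str)], offset: usize, limit: usize) -> Vec<u32> {
    let mut seen = std::collections::HashSet::new();
    ids_with_tag
        .iter()
        .filter(|(_, tag)| seen.insert(*tag)) // keep the first hit per distinct value
        .map(|(id, _)| *id)
        .skip(offset) // offset counts distinct hits, like `.skip($offset)` above
        .take(limit)
        .collect()
}

fn main() {
    let hits = [(0, "red"), (1, "red"), (2, "blue"), (3, "green"), (4, "blue")];
    assert_eq!(distinct_page(&hits, 0, 2), vec![0, 2]);
    assert_eq!(distinct_page(&hits, 1, 2), vec![2, 3]); // page 2 starts after "red"
    println!("distinct pagination behaves as expected");
}
```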

View File

@@ -5,7 +5,7 @@ use std::io::Cursor;
 use big_s::S;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
-use maplit::{btreemap, hashset};
+use maplit::{hashmap, hashset};
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
@@ -51,7 +51,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("tag"),
         S("asc_desc_rank"),
     });
-    builder.set_synonyms(btreemap! {
+    builder.set_synonyms(hashmap! {
         S("hello") => vec![S("good morning")],
         S("world") => vec![S("earth")],
         S("america") => vec![S("the united states")],

View File

@@ -186,16 +186,12 @@ fn create_value(value: &Document, mut selectors: HashSet<&str>) -> Document {
                 let array = create_array(array, &sub_selectors);
                 if !array.is_empty() {
                     new_value.insert(key.to_string(), array.into());
-                } else {
-                    new_value.insert(key.to_string(), Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {
                 let object = create_value(object, sub_selectors);
                 if !object.is_empty() {
                     new_value.insert(key.to_string(), object.into());
-                } else {
-                    new_value.insert(key.to_string(), Value::Object(Map::new()));
                 }
             }
             _ => (),
@@ -215,8 +211,6 @@ fn create_array(array: &[Value], selectors: &HashSet<&str>) -> Vec<Value> {
                 let array = create_array(array, selectors);
                 if !array.is_empty() {
                     res.push(array.into());
-                } else {
-                    res.push(Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {
@@ -643,24 +637,6 @@ mod tests {
         );
     }

-    #[test]
-    fn empty_array_object_return_empty() {
-        let value: Value = json!({
-            "array": [],
-            "object": {},
-        });
-        let value: &Document = value.as_object().unwrap();
-
-        let res: Value = select_values(value, vec!["array.name", "object.name"]).into();
-        assert_eq!(
-            res,
-            json!({
-                "array": [],
-                "object": {},
-            })
-        );
-    }
-
     #[test]
     fn all_conflict_variation() {
         let value: Value = json!({