mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-22 14:21:03 +00:00
Compare commits
22 Commits
Author | SHA1 | Date | |
---|---|---|---|
32d2cc3aea | |||
8a17fcdda5 | |||
9602d7a960 | |||
ac12a4b9c9 | |||
af96050944 | |||
a43b37dfc1 | |||
c08dcac1d4 | |||
a17dccd84e | |||
9a57cab3ee | |||
751b060320 | |||
4111b99a6d | |||
d6fb2b56d1 | |||
cb5c77e536 | |||
44c89b1ea2 | |||
26a285053b | |||
1446a6a2d2 | |||
047eba3ff3 | |||
8d9d183ce6 | |||
eb67195840 | |||
93306c2326 | |||
7d9cf8d713 | |||
03eb7898e7 |
@ -12,6 +12,7 @@ A _full-text search database_ based on the fast [LMDB key-value store](https://e
|
||||
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
|
||||
- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
|
||||
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
|
||||
- Searches for [concatenated](https://github.com/meilisearch/MeiliDB/pull/164) and [splitted query words](https://github.com/meilisearch/MeiliDB/pull/232) to improve the search quality.
|
||||
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
|
||||
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages
|
||||
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results
|
||||
|
@ -40,7 +40,7 @@ struct IndexCommand {
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
struct SearchCommand {
|
||||
/// The destination where the database must be created.
|
||||
/// The path of the database to work with.
|
||||
#[structopt(parse(from_os_str))]
|
||||
database_path: PathBuf,
|
||||
|
||||
@ -65,10 +65,18 @@ struct SearchCommand {
|
||||
displayed_fields: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
struct ShowUpdatesCommand {
|
||||
/// The path of the database to work with.
|
||||
#[structopt(parse(from_os_str))]
|
||||
database_path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
enum Command {
|
||||
Index(IndexCommand),
|
||||
Search(SearchCommand),
|
||||
ShowUpdates(ShowUpdatesCommand),
|
||||
}
|
||||
|
||||
impl Command {
|
||||
@ -76,6 +84,7 @@ impl Command {
|
||||
match self {
|
||||
Command::Index(command) => &command.database_path,
|
||||
Command::Search(command) => &command.database_path,
|
||||
Command::ShowUpdates(command) => &command.database_path,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -303,6 +312,7 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
let reader = env.read_txn().unwrap();
|
||||
let schema = index.main.schema(&reader)?;
|
||||
reader.abort();
|
||||
|
||||
let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
|
||||
|
||||
let fields = command.displayed_fields.iter().map(String::as_str);
|
||||
@ -418,6 +428,23 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn show_updates_command(
|
||||
_command: ShowUpdatesCommand,
|
||||
database: Database,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
let env = &database.env;
|
||||
let index = database
|
||||
.open_index(INDEX_NAME)
|
||||
.expect("Could not find index");
|
||||
|
||||
let reader = env.read_txn().unwrap();
|
||||
let updates = index.all_updates_status(&reader)?;
|
||||
println!("{:#?}", updates);
|
||||
reader.abort();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
env_logger::init();
|
||||
|
||||
@ -427,5 +454,6 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
match opt {
|
||||
Command::Index(command) => index_command(command, database),
|
||||
Command::Search(command) => search_command(command, database),
|
||||
Command::ShowUpdates(command) => show_updates_command(command, database),
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ mod dfa;
|
||||
mod query_enhancer;
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::vec;
|
||||
use std::{cmp, vec};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
@ -18,7 +18,7 @@ use self::query_enhancer::QueryEnhancerBuilder;
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
pub struct AutomatonProducer {
|
||||
automatons: Vec<Vec<Automaton>>,
|
||||
automatons: Vec<AutomatonGroup>,
|
||||
}
|
||||
|
||||
impl AutomatonProducer {
|
||||
@ -26,19 +26,47 @@ impl AutomatonProducer {
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_list_store: store::PostingsLists,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||
let (automatons, query_enhancer) =
|
||||
generate_automatons(reader, query, main_store, synonyms_store)?;
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_list_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
|
||||
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
|
||||
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
|
||||
self.automatons.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AutomatonGroup {
|
||||
pub is_phrase_query: bool,
|
||||
pub automatons: Vec<Automaton>,
|
||||
}
|
||||
|
||||
impl AutomatonGroup {
|
||||
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: false,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: true,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Automaton {
|
||||
pub index: usize,
|
||||
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
|
||||
string
|
||||
}
|
||||
|
||||
fn split_best_frequency<'a>(
|
||||
reader: &heed::RoTxn,
|
||||
word: &'a str,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Option<(&'a str, &'a str)>> {
|
||||
let chars = word.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = word.split_at(i);
|
||||
|
||||
let left_freq = postings_lists_store
|
||||
.postings_list(reader, left.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let right_freq = postings_lists_store
|
||||
.postings_list(reader, right.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let min_freq = cmp::min(left_freq, right_freq);
|
||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||
best = Some((min_freq, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, l, r)| (l, r)))
|
||||
}
|
||||
|
||||
fn generate_automatons(
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
||||
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
@ -136,7 +193,7 @@ fn generate_automatons(
|
||||
original_automatons.push(automaton);
|
||||
}
|
||||
|
||||
automatons.push(original_automatons);
|
||||
automatons.push(AutomatonGroup::normal(original_automatons));
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
@ -188,13 +245,27 @@ fn generate_automatons(
|
||||
Automaton::non_exact(automaton_index, n, synonym)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n != 1 {
|
||||
if n == 1 {
|
||||
if let Some((left, right)) =
|
||||
split_best_frequency(reader, &normalized, postings_lists_store)?
|
||||
{
|
||||
let a = Automaton::exact(automaton_index, 1, left);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
let b = Automaton::exact(automaton_index, 1, right);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
|
||||
}
|
||||
} else {
|
||||
// automaton of concatenation of query words
|
||||
let concat = ngram_slice.concat();
|
||||
let normalized = normalize_str(&concat);
|
||||
@ -204,16 +275,20 @@ fn generate_automatons(
|
||||
|
||||
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// order automatons, the most important first,
|
||||
// we keep the original automatons at the front.
|
||||
automatons[1..].sort_by_key(|a| {
|
||||
let a = a.first().unwrap();
|
||||
(Reverse(a.is_exact), a.ngram)
|
||||
automatons[1..].sort_by_key(|group| {
|
||||
let a = group.automatons.first().unwrap();
|
||||
(
|
||||
Reverse(a.is_exact),
|
||||
a.ngram,
|
||||
Reverse(group.automatons.len()),
|
||||
)
|
||||
});
|
||||
|
||||
Ok((automatons, enhancer_builder.build()))
|
||||
|
@ -21,16 +21,15 @@ fn number_exact_matches(
|
||||
let len = group.len();
|
||||
|
||||
let mut found_exact = false;
|
||||
for (pos, _) in is_exact[index..index + len]
|
||||
.iter()
|
||||
.filter(|x| **x)
|
||||
.enumerate()
|
||||
{
|
||||
found_exact = true;
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
|
||||
if *is_exact {
|
||||
found_exact = true;
|
||||
let attr = &attribute[index + pos];
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -35,8 +35,13 @@ fn update_awaiter(receiver: Receiver<()>, env: heed::Env, update_fn: Arc<ArcSwap
|
||||
|
||||
match update::update_task(&mut writer, index.clone()) {
|
||||
Ok(Some(status)) => {
|
||||
if let Err(e) = writer.commit() {
|
||||
error!("update transaction failed: {}", e)
|
||||
match status.result {
|
||||
Ok(_) => {
|
||||
if let Err(e) = writer.commit() {
|
||||
error!("update transaction failed: {}", e)
|
||||
}
|
||||
}
|
||||
Err(_) => writer.abort(),
|
||||
}
|
||||
|
||||
if let Some(ref callback) = *update_fn.load() {
|
||||
|
@ -12,7 +12,6 @@ pub enum Error {
|
||||
SchemaMissing,
|
||||
WordIndexMissing,
|
||||
MissingDocumentId,
|
||||
DuplicateDocument,
|
||||
Zlmdb(heed::Error),
|
||||
Fst(fst::Error),
|
||||
SerdeJson(SerdeJsonError),
|
||||
@ -80,7 +79,6 @@ impl fmt::Display for Error {
|
||||
SchemaMissing => write!(f, "this index does not have a schema"),
|
||||
WordIndexMissing => write!(f, "this index does not have a word index"),
|
||||
MissingDocumentId => write!(f, "document id is missing"),
|
||||
DuplicateDocument => write!(f, "update contains documents with the same id"),
|
||||
Zlmdb(e) => write!(f, "heed error; {}", e),
|
||||
Fst(e) => write!(f, "fst error; {}", e),
|
||||
SerdeJson(e) => write!(f, "serde json error; {}", e),
|
||||
|
@ -1,4 +1,5 @@
|
||||
use hashbrown::HashMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
@ -8,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
|
||||
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
|
||||
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
||||
@ -138,7 +139,7 @@ fn multiword_rewrite_matches(
|
||||
|
||||
fn fetch_raw_documents(
|
||||
reader: &heed::RoTxn,
|
||||
automatons: &[Automaton],
|
||||
automatons_groups: &[AutomatonGroup],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
searchables: Option<&ReorderedAttrs>,
|
||||
main_store: store::Main,
|
||||
@ -148,55 +149,94 @@ fn fetch_raw_documents(
|
||||
let mut matches = Vec::new();
|
||||
let mut highlights = Vec::new();
|
||||
|
||||
for automaton in automatons {
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
for group in automatons_groups {
|
||||
let AutomatonGroup {
|
||||
is_phrase_query,
|
||||
automatons,
|
||||
} = group;
|
||||
let phrase_query_len = automatons.len();
|
||||
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
let mut tmp_matches = Vec::new();
|
||||
for (id, automaton) in automatons.into_iter().enumerate() {
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
matches.reserve(doc_indexes.len());
|
||||
highlights.reserve(doc_indexes.len());
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: di.char_length,
|
||||
};
|
||||
tmp_matches.reserve(doc_indexes.len());
|
||||
|
||||
matches.push((di.document_id, match_));
|
||||
highlights.push((di.document_id, highlight));
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: u16::try_from(*query_len).unwrap_or(u16::max_value()),
|
||||
};
|
||||
|
||||
tmp_matches.push((di.document_id, id, match_, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if *is_phrase_query {
|
||||
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
|
||||
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
|
||||
for window in group.windows(2) {
|
||||
let (ida, ia, ma, ha) = window[0];
|
||||
let (idb, ib, mb, hb) = window[1];
|
||||
|
||||
debug_assert_eq!(ida, idb);
|
||||
|
||||
// if matches must follow and actually follows themselves
|
||||
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
|
||||
// TODO we must make it work for phrase query longer than 2
|
||||
// if the second match is the last phrase query word
|
||||
if ib + 1 == phrase_query_len {
|
||||
// insert first match
|
||||
matches.push((ida, ma));
|
||||
highlights.push((ida, ha));
|
||||
|
||||
// insert second match
|
||||
matches.push((idb, mb));
|
||||
highlights.push((idb, hb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (id, _, match_, highlight) in tmp_matches {
|
||||
matches.push((id, match_));
|
||||
highlights.push((id, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
||||
@ -367,15 +407,20 @@ where
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::with_capacity(range.len());
|
||||
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
let automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
|
||||
// aggregate automatons groups by groups after time
|
||||
for auts in automaton_producer {
|
||||
automatons.extend(auts);
|
||||
automatons.push(auts);
|
||||
|
||||
// we must retrieve the documents associated
|
||||
// with the current automatons
|
||||
@ -480,15 +525,20 @@ where
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::new();
|
||||
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
let automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
|
||||
// aggregate automatons groups by groups after time
|
||||
for auts in automaton_producer {
|
||||
automatons.extend(auts);
|
||||
automatons.push(auts);
|
||||
|
||||
// we must retrieve the documents associated
|
||||
// with the current automatons
|
||||
@ -1697,4 +1747,68 @@ mod tests {
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_phrase_query_splitting() {
|
||||
let store = TempDatabase::from_iter(vec![
|
||||
("search", &[doc_index(0, 0)][..]),
|
||||
("engine", &[doc_index(0, 1)][..]),
|
||||
("search", &[doc_index(1, 0)][..]),
|
||||
("slow", &[doc_index(1, 1)][..]),
|
||||
("engine", &[doc_index(1, 2)][..]),
|
||||
]);
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn harder_phrase_query_splitting() {
|
||||
let store = TempDatabase::from_iter(vec![
|
||||
("search", &[doc_index(0, 0)][..]),
|
||||
("search", &[doc_index(0, 1)][..]),
|
||||
("engine", &[doc_index(0, 2)][..]),
|
||||
("search", &[doc_index(1, 0)][..]),
|
||||
("slow", &[doc_index(1, 1)][..]),
|
||||
("search", &[doc_index(1, 2)][..]),
|
||||
("engine", &[doc_index(1, 3)][..]),
|
||||
("search", &[doc_index(1, 0)][..]),
|
||||
("search", &[doc_index(1, 1)][..]),
|
||||
("slow", &[doc_index(1, 2)][..]),
|
||||
("engine", &[doc_index(1, 3)][..]),
|
||||
]);
|
||||
|
||||
let env = &store.database.env;
|
||||
let reader = env.read_txn().unwrap();
|
||||
|
||||
let builder = store.query_builder();
|
||||
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
|
||||
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
}
|
||||
|
@ -26,6 +26,10 @@ impl DocsWords {
|
||||
self.docs_words.delete(writer, &document_id)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.docs_words.clear(writer)
|
||||
}
|
||||
|
||||
pub fn doc_words(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
|
@ -32,6 +32,10 @@ impl DocumentsFields {
|
||||
self.documents_fields.delete_range(writer, start..=end)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.documents_fields.clear(writer)
|
||||
}
|
||||
|
||||
pub fn document_attribute<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
|
@ -32,6 +32,10 @@ impl DocumentsFieldsCounts {
|
||||
.delete_range(writer, start..=end)
|
||||
}
|
||||
|
||||
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
|
||||
self.documents_fields_counts.clear(writer)
|
||||
}
|
||||
|
||||
pub fn document_field_count(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
|
@ -166,6 +166,11 @@ impl Index {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
let _ = self.updates_notifier.send(());
|
||||
update::push_clear_all(writer, self.updates, self.updates_results)
|
||||
}
|
||||
|
||||
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
|
||||
update::SynonymsAddition::new(
|
||||
self.updates,
|
||||
@ -197,6 +202,20 @@ impl Index {
|
||||
update::update_status(reader, self.updates, self.updates_results, update_id)
|
||||
}
|
||||
|
||||
pub fn all_updates_status(&self, reader: &heed::RoTxn) -> MResult<Vec<update::UpdateStatus>> {
|
||||
match self.updates_results.last_update_id(reader)? {
|
||||
Some((last_id, _)) => {
|
||||
let mut updates = Vec::with_capacity(last_id as usize + 1);
|
||||
for id in 0..=last_id {
|
||||
let update = self.update_status(reader, id)?;
|
||||
updates.push(update);
|
||||
}
|
||||
Ok(updates)
|
||||
}
|
||||
None => Ok(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> QueryBuilder {
|
||||
QueryBuilder::new(
|
||||
self.main,
|
||||
|
33
meilidb-core/src/update/clear_all.rs
Normal file
33
meilidb-core/src/update/clear_all.rs
Normal file
@ -0,0 +1,33 @@
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult, RankedMap};
|
||||
|
||||
pub fn apply_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &RankedMap::default())?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
documents_fields_store.clear(writer)?;
|
||||
documents_fields_counts_store.clear(writer)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn push_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
let update = Update::ClearAll;
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fst::{set::OpBuilder, SetBuilder};
|
||||
use sdset::{duo::Union, SetOperation};
|
||||
@ -86,7 +86,7 @@ pub fn apply_documents_addition(
|
||||
docs_words_store: store::DocsWords,
|
||||
addition: Vec<serde_json::Value>,
|
||||
) -> MResult<()> {
|
||||
let mut documents_ids = HashSet::new();
|
||||
let mut documents_additions = HashMap::new();
|
||||
let mut indexer = RawIndexer::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
@ -97,19 +97,18 @@ pub fn apply_documents_addition(
|
||||
let identifier = schema.identifier_name();
|
||||
|
||||
// 1. store documents ids for future deletion
|
||||
for document in addition.iter() {
|
||||
for document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
if !documents_ids.insert(document_id) {
|
||||
return Err(Error::DuplicateDocument);
|
||||
}
|
||||
documents_additions.insert(document_id, document);
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_ids.len();
|
||||
let number_of_inserted_documents = documents_additions.len();
|
||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
@ -117,7 +116,7 @@ pub fn apply_documents_addition(
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids.into_iter().collect(),
|
||||
documents_ids,
|
||||
)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
@ -126,12 +125,7 @@ pub fn apply_documents_addition(
|
||||
};
|
||||
|
||||
// 3. index the documents fields in the stores
|
||||
for document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
for (document_id, document) in documents_additions {
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
@ -184,6 +178,7 @@ pub fn reindex_all_documents(
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
// 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
|
||||
let mut indexer = RawIndexer::new();
|
||||
|
@ -1,3 +1,4 @@
|
||||
mod clear_all;
|
||||
mod customs_update;
|
||||
mod documents_addition;
|
||||
mod documents_deletion;
|
||||
@ -5,6 +6,7 @@ mod schema_update;
|
||||
mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
|
||||
pub use self::clear_all::{apply_clear_all, push_clear_all};
|
||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||
@ -25,6 +27,7 @@ use meilidb_schema::Schema;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum Update {
|
||||
ClearAll,
|
||||
Schema(Schema),
|
||||
Customs(Vec<u8>),
|
||||
DocumentsAddition(Vec<serde_json::Value>),
|
||||
@ -35,6 +38,7 @@ pub enum Update {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateType {
|
||||
ClearAll,
|
||||
Schema { schema: Schema },
|
||||
Customs,
|
||||
DocumentsAddition { number: usize },
|
||||
@ -43,12 +47,12 @@ pub enum UpdateType {
|
||||
SynonymsDeletion { number: usize },
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DetailedDuration {
|
||||
pub main: Duration,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct UpdateResult {
|
||||
pub update_id: u64,
|
||||
pub update_type: UpdateType,
|
||||
@ -56,7 +60,7 @@ pub struct UpdateResult {
|
||||
pub detailed_duration: DetailedDuration,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateStatus {
|
||||
Enqueued,
|
||||
Processed(UpdateResult),
|
||||
@ -107,6 +111,21 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
debug!("Processing update number {}", update_id);
|
||||
|
||||
let (update_type, result, duration) = match update {
|
||||
Update::ClearAll => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::ClearAll;
|
||||
let result = apply_clear_all(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
Update::Schema(schema) => {
|
||||
let start = Instant::now();
|
||||
|
||||
|
Reference in New Issue
Block a user