mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-20 21:30:58 +00:00
Compare commits
20 Commits
Author | SHA1 | Date | |
---|---|---|---|
a17dccd84e | |||
9a57cab3ee | |||
751b060320 | |||
4111b99a6d | |||
d6fb2b56d1 | |||
cb5c77e536 | |||
44c89b1ea2 | |||
26a285053b | |||
1446a6a2d2 | |||
047eba3ff3 | |||
8d9d183ce6 | |||
eb67195840 | |||
93306c2326 | |||
7d9cf8d713 | |||
03eb7898e7 | |||
0fbd4cd632 | |||
858bf359b8 | |||
5dc8465ebd | |||
0f30a221fa | |||
e86a547e93 |
@ -12,6 +12,7 @@ A _full-text search database_ based on the fast [LMDB key-value store](https://e
|
||||
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
|
||||
- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
|
||||
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
|
||||
- Searches for [concatenated](https://github.com/meilisearch/MeiliDB/pull/164) and [split query words](https://github.com/meilisearch/MeiliDB/pull/232) to improve the search quality.
|
||||
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
|
||||
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages
|
||||
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results
|
||||
|
@ -2,7 +2,7 @@ mod dfa;
|
||||
mod query_enhancer;
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::vec;
|
||||
use std::{cmp, vec};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
@ -18,7 +18,7 @@ use self::query_enhancer::QueryEnhancerBuilder;
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
pub struct AutomatonProducer {
|
||||
automatons: Vec<Vec<Automaton>>,
|
||||
automatons: Vec<AutomatonGroup>,
|
||||
}
|
||||
|
||||
impl AutomatonProducer {
|
||||
@ -26,19 +26,47 @@ impl AutomatonProducer {
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_list_store: store::PostingsLists,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||
let (automatons, query_enhancer) =
|
||||
generate_automatons(reader, query, main_store, synonyms_store)?;
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_list_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
|
||||
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
|
||||
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
|
||||
self.automatons.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AutomatonGroup {
|
||||
pub is_phrase_query: bool,
|
||||
pub automatons: Vec<Automaton>,
|
||||
}
|
||||
|
||||
impl AutomatonGroup {
|
||||
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: false,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||
AutomatonGroup {
|
||||
is_phrase_query: true,
|
||||
automatons,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Automaton {
|
||||
pub index: usize,
|
||||
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
|
||||
string
|
||||
}
|
||||
|
||||
fn split_best_frequency<'a>(
|
||||
reader: &heed::RoTxn,
|
||||
word: &'a str,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Option<(&'a str, &'a str)>> {
|
||||
let chars = word.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = word.split_at(i);
|
||||
|
||||
let left_freq = postings_lists_store
|
||||
.postings_list(reader, left.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let right_freq = postings_lists_store
|
||||
.postings_list(reader, right.as_ref())?
|
||||
.map_or(0, |i| i.len());
|
||||
|
||||
let min_freq = cmp::min(left_freq, right_freq);
|
||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||
best = Some((min_freq, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, l, r)| (l, r)))
|
||||
}
|
||||
|
||||
fn generate_automatons(
|
||||
reader: &heed::RoTxn,
|
||||
query: &str,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
||||
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
@ -136,7 +193,7 @@ fn generate_automatons(
|
||||
original_automatons.push(automaton);
|
||||
}
|
||||
|
||||
automatons.push(original_automatons);
|
||||
automatons.push(AutomatonGroup::normal(original_automatons));
|
||||
|
||||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
@ -188,13 +245,27 @@ fn generate_automatons(
|
||||
Automaton::non_exact(automaton_index, n, synonym)
|
||||
};
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n != 1 {
|
||||
if n == 1 {
|
||||
if let Some((left, right)) =
|
||||
split_best_frequency(reader, &normalized, postings_lists_store)?
|
||||
{
|
||||
let a = Automaton::exact(automaton_index, 1, left);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
let b = Automaton::exact(automaton_index, 1, right);
|
||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||
automaton_index += 1;
|
||||
|
||||
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
|
||||
}
|
||||
} else {
|
||||
// automaton of concatenation of query words
|
||||
let concat = ngram_slice.concat();
|
||||
let normalized = normalize_str(&concat);
|
||||
@ -204,16 +275,20 @@ fn generate_automatons(
|
||||
|
||||
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||
automaton_index += 1;
|
||||
automatons.push(vec![automaton]);
|
||||
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// order automatons, the most important first,
|
||||
// we keep the original automatons at the front.
|
||||
automatons[1..].sort_by_key(|a| {
|
||||
let a = a.first().unwrap();
|
||||
(Reverse(a.is_exact), a.ngram)
|
||||
automatons[1..].sort_by_key(|group| {
|
||||
let a = group.automatons.first().unwrap();
|
||||
(
|
||||
Reverse(a.is_exact),
|
||||
a.ngram,
|
||||
Reverse(group.automatons.len()),
|
||||
)
|
||||
});
|
||||
|
||||
Ok((automatons, enhancer_builder.build()))
|
||||
|
@ -21,16 +21,15 @@ fn number_exact_matches(
|
||||
let len = group.len();
|
||||
|
||||
let mut found_exact = false;
|
||||
for (pos, _) in is_exact[index..index + len]
|
||||
.iter()
|
||||
.filter(|x| **x)
|
||||
.enumerate()
|
||||
{
|
||||
found_exact = true;
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
|
||||
if *is_exact {
|
||||
found_exact = true;
|
||||
let attr = &attribute[index + pos];
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
|
||||
let (_, count) = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -35,8 +35,10 @@ fn update_awaiter(receiver: Receiver<()>, env: heed::Env, update_fn: Arc<ArcSwap
|
||||
|
||||
match update::update_task(&mut writer, index.clone()) {
|
||||
Ok(Some(status)) => {
|
||||
if let Err(e) = writer.commit() {
|
||||
error!("update transaction failed: {}", e)
|
||||
if status.result.is_ok() {
|
||||
if let Err(e) = writer.commit() {
|
||||
error!("update transaction failed: {}", e)
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref callback) = *update_fn.load() {
|
||||
|
@ -12,6 +12,7 @@ pub enum Error {
|
||||
SchemaMissing,
|
||||
WordIndexMissing,
|
||||
MissingDocumentId,
|
||||
DuplicateDocument,
|
||||
Zlmdb(heed::Error),
|
||||
Fst(fst::Error),
|
||||
SerdeJson(SerdeJsonError),
|
||||
@ -79,6 +80,7 @@ impl fmt::Display for Error {
|
||||
SchemaMissing => write!(f, "this index does not have a schema"),
|
||||
WordIndexMissing => write!(f, "this index does not have a word index"),
|
||||
MissingDocumentId => write!(f, "document id is missing"),
|
||||
DuplicateDocument => write!(f, "update contains documents with the same id"),
|
||||
Zlmdb(e) => write!(f, "heed error; {}", e),
|
||||
Fst(e) => write!(f, "fst error; {}", e),
|
||||
SerdeJson(e) => write!(f, "serde json error; {}", e),
|
||||
@ -95,6 +97,10 @@ impl error::Error for Error {}
|
||||
/// Operations that are rejected as unsupported; the variant names and their
/// `Display` messages indicate these are mostly schema updates.
#[derive(Debug)]
|
||||
pub enum UnsupportedOperation {
|
||||
/// The index already has a schema.
SchemaAlreadyExists,
|
||||
/// The identifier of a schema cannot be updated.
CannotUpdateSchemaIdentifier,
|
||||
/// The attributes of a schema cannot be reordered.
CannotReorderSchemaAttribute,
|
||||
/// New attributes cannot be introduced in a schema.
CannotIntroduceNewSchemaAttribute,
|
||||
/// Attributes cannot be removed from a schema.
CannotRemoveSchemaAttribute,
|
||||
}
|
||||
|
||||
impl fmt::Display for UnsupportedOperation {
|
||||
@ -102,6 +108,12 @@ impl fmt::Display for UnsupportedOperation {
|
||||
use self::UnsupportedOperation::*;
|
||||
match self {
|
||||
SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
|
||||
CannotUpdateSchemaIdentifier => write!(f, "Cannot update the identifier of a schema"),
|
||||
CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
|
||||
CannotIntroduceNewSchemaAttribute => {
|
||||
write!(f, "Cannot introduce new attributes in a schema")
|
||||
}
|
||||
CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
use hashbrown::HashMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
@ -8,7 +9,7 @@ use fst::{IntoStreamer, Streamer};
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
|
||||
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
|
||||
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
||||
@ -138,7 +139,7 @@ fn multiword_rewrite_matches(
|
||||
|
||||
fn fetch_raw_documents(
|
||||
reader: &heed::RoTxn,
|
||||
automatons: &[Automaton],
|
||||
automatons_groups: &[AutomatonGroup],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
searchables: Option<&ReorderedAttrs>,
|
||||
main_store: store::Main,
|
||||
@ -148,55 +149,94 @@ fn fetch_raw_documents(
|
||||
let mut matches = Vec::new();
|
||||
let mut highlights = Vec::new();
|
||||
|
||||
for automaton in automatons {
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
for group in automatons_groups {
|
||||
let AutomatonGroup {
|
||||
is_phrase_query,
|
||||
automatons,
|
||||
} = group;
|
||||
let phrase_query_len = automatons.len();
|
||||
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
let mut tmp_matches = Vec::new();
|
||||
for (id, automaton) in automatons.into_iter().enumerate() {
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
matches.reserve(doc_indexes.len());
|
||||
highlights.reserve(doc_indexes.len());
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: di.char_length,
|
||||
};
|
||||
tmp_matches.reserve(doc_indexes.len());
|
||||
|
||||
matches.push((di.document_id, match_));
|
||||
highlights.push((di.document_id, highlight));
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: u16::try_from(*query_len).unwrap_or(u16::max_value()),
|
||||
};
|
||||
|
||||
tmp_matches.push((di.document_id, id, match_, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if *is_phrase_query {
|
||||
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
|
||||
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
|
||||
for window in group.windows(2) {
|
||||
let (ida, ia, ma, ha) = window[0];
|
||||
let (idb, ib, mb, hb) = window[1];
|
||||
|
||||
debug_assert_eq!(ida, idb);
|
||||
|
||||
// if matches must follow and actually follows themselves
|
||||
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
|
||||
// TODO we must make it work for phrase query longer than 2
|
||||
// if the second match is the last phrase query word
|
||||
if ib + 1 == phrase_query_len {
|
||||
// insert first match
|
||||
matches.push((ida, ma));
|
||||
highlights.push((ida, ha));
|
||||
|
||||
// insert second match
|
||||
matches.push((idb, mb));
|
||||
highlights.push((idb, hb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (id, _, match_, highlight) in tmp_matches {
|
||||
matches.push((id, match_));
|
||||
highlights.push((id, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
||||
@ -367,15 +407,20 @@ where
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::with_capacity(range.len());
|
||||
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
let automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
|
||||
// aggregate automatons groups by groups after time
|
||||
for auts in automaton_producer {
|
||||
automatons.extend(auts);
|
||||
automatons.push(auts);
|
||||
|
||||
// we must retrieve the documents associated
|
||||
// with the current automatons
|
||||
@ -480,15 +525,20 @@ where
|
||||
let start_processing = Instant::now();
|
||||
let mut raw_documents_processed = Vec::new();
|
||||
|
||||
let (automaton_producer, query_enhancer) =
|
||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
||||
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
|
||||
let automaton_producer = automaton_producer.into_iter();
|
||||
let mut automatons = Vec::new();
|
||||
|
||||
// aggregate automatons groups by groups after time
|
||||
for auts in automaton_producer {
|
||||
automatons.extend(auts);
|
||||
automatons.push(auts);
|
||||
|
||||
// we must retrieve the documents associated
|
||||
// with the current automatons
|
||||
@ -1697,4 +1747,68 @@ mod tests {
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
fn simple_phrase_query_splitting() {
    // "search" and "engine" are adjacent in the first document only;
    // the second one has "slow" between them.
    // NOTE(review): presumably doc_index(document, word_index) — confirm against the helper.
    let store = TempDatabase::from_iter(vec![
        ("search", &[doc_index(0, 0)][..]),
        ("engine", &[doc_index(0, 1)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("slow", &[doc_index(1, 1)][..]),
        ("engine", &[doc_index(1, 2)][..]),
    ]);

    let reader = store.database.env.read_txn().unwrap();

    // the query is a single word that has to be split to match anything
    let mut documents = store
        .query_builder()
        .query(&reader, "searchengine", 0..20)
        .unwrap()
        .into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });
    assert_matches!(documents.next(), None);
}
|
||||
|
||||
#[test]
fn harder_phrase_query_splitting() {
    // Both documents contain "search" several times, but only one occurrence
    // in each is directly followed by "engine".
    // NOTE(review): presumably doc_index(document, word_index) — confirm against the helper.
    let store = TempDatabase::from_iter(vec![
        ("search", &[doc_index(0, 0)][..]),
        ("search", &[doc_index(0, 1)][..]),
        ("engine", &[doc_index(0, 2)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("slow", &[doc_index(1, 1)][..]),
        ("search", &[doc_index(1, 2)][..]),
        ("engine", &[doc_index(1, 3)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("search", &[doc_index(1, 1)][..]),
        ("slow", &[doc_index(1, 2)][..]),
        ("engine", &[doc_index(1, 3)][..]),
    ]);

    let reader = store.database.env.read_txn().unwrap();

    let mut documents = store
        .query_builder()
        .query(&reader, "searchengine", 0..20)
        .unwrap()
        .into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });
    assert_matches!(documents.next(), None);
}
|
||||
}
|
||||
|
@ -12,8 +12,8 @@ impl ser::Serializer for ConvertToString {
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapConvertToString;
|
||||
type SerializeStruct = StructConvertToString;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
@ -169,7 +169,9 @@ impl ser::Serializer for ConvertToString {
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { type_name: "map" })
|
||||
Ok(MapConvertToString {
|
||||
text: String::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
@ -177,8 +179,8 @@ impl ser::Serializer for ConvertToString {
|
||||
_name: &'static str,
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Err(SerializerError::UnserializableType {
|
||||
type_name: "struct",
|
||||
Ok(StructConvertToString {
|
||||
text: String::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@ -194,3 +196,63 @@ impl ser::Serializer for ConvertToString {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapConvertToString {
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl ser::SerializeMap for MapConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = key.serialize(ConvertToString)?;
|
||||
self.text.push_str(&text);
|
||||
self.text.push_str(" ");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let text = value.serialize(ConvertToString)?;
|
||||
self.text.push_str(&text);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.text)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructConvertToString {
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl ser::SerializeStruct for StructConvertToString {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T,
|
||||
) -> Result<(), Self::Error>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
let value = value.serialize(ConvertToString)?;
|
||||
self.text.push_str(key);
|
||||
self.text.push_str(" ");
|
||||
self.text.push_str(&value);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(self.text)
|
||||
}
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapIndexer<'a>;
|
||||
type SerializeStruct = StructSerializer<'a>;
|
||||
type SerializeStruct = StructIndexer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
|
||||
@ -302,14 +302,14 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a> {
|
||||
pub struct StructIndexer<'a> {
|
||||
attribute: SchemaAttr,
|
||||
document_id: DocumentId,
|
||||
indexer: &'a mut RawIndexer,
|
||||
texts: Vec<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
|
||||
type Ok = Option<usize>;
|
||||
type Error = SerializerError;
|
||||
|
||||
|
@ -20,16 +20,14 @@ pub use self::convert_to_string::ConvertToString;
|
||||
pub use self::deserializer::{Deserializer, DeserializerError};
|
||||
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
|
||||
pub use self::indexer::Indexer;
|
||||
pub use self::serializer::Serializer;
|
||||
pub use self::serializer::{serialize_value, Serializer};
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::{error::Error, fmt};
|
||||
|
||||
use meilidb_schema::SchemaAttr;
|
||||
use serde::ser;
|
||||
use serde_json::Error as SerdeJsonError;
|
||||
|
||||
use crate::{DocumentId, ParseNumberError};
|
||||
use crate::ParseNumberError;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
@ -103,25 +101,3 @@ impl From<ParseNumberError> for SerializerError {
|
||||
SerializerError::ParseNumber(error)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
|
||||
|
||||
impl RamDocumentStore {
|
||||
pub fn new() -> RamDocumentStore {
|
||||
RamDocumentStore(BTreeMap::new())
|
||||
}
|
||||
|
||||
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
|
||||
self.0.insert((id, attr), value);
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RamDocumentStore {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
@ -1,17 +1,17 @@
|
||||
use meilidb_schema::{Schema, SchemaAttr};
|
||||
use meilidb_schema::{Schema, SchemaAttr, SchemaProps};
|
||||
use serde::ser;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::RamDocumentStore;
|
||||
use crate::store::{DocumentsFields, DocumentsFieldsCounts};
|
||||
use crate::{DocumentId, RankedMap};
|
||||
|
||||
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
|
||||
|
||||
pub struct Serializer<'a> {
|
||||
pub txn: &'a mut heed::RwTxn,
|
||||
pub schema: &'a Schema,
|
||||
pub document_store: &'a mut RamDocumentStore,
|
||||
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||
pub document_store: DocumentsFields,
|
||||
pub document_fields_counts: DocumentsFieldsCounts,
|
||||
pub indexer: &'a mut RawIndexer,
|
||||
pub ranked_map: &'a mut RankedMap,
|
||||
pub document_id: DocumentId,
|
||||
@ -150,6 +150,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
txn: self.txn,
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
@ -166,6 +167,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
_len: usize,
|
||||
) -> Result<Self::SerializeStruct, Self::Error> {
|
||||
Ok(StructSerializer {
|
||||
txn: self.txn,
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
document_store: self.document_store,
|
||||
@ -189,10 +191,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a> {
|
||||
txn: &'a mut heed::RwTxn,
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: &'a mut RamDocumentStore,
|
||||
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||
document_store: DocumentsFields,
|
||||
document_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
current_key_name: Option<String>,
|
||||
@ -229,17 +232,20 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
V: ser::Serialize,
|
||||
{
|
||||
let key = key.serialize(ConvertToString)?;
|
||||
|
||||
serialize_value(
|
||||
self.schema,
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
&key,
|
||||
value,
|
||||
)
|
||||
match self.schema.attribute(&key) {
|
||||
Some(attribute) => serialize_value(
|
||||
self.txn,
|
||||
attribute,
|
||||
self.schema.props(attribute),
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
value,
|
||||
),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -248,10 +254,11 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a> {
|
||||
txn: &'a mut heed::RwTxn,
|
||||
schema: &'a Schema,
|
||||
document_id: DocumentId,
|
||||
document_store: &'a mut RamDocumentStore,
|
||||
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||
document_store: DocumentsFields,
|
||||
document_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &'a mut RawIndexer,
|
||||
ranked_map: &'a mut RankedMap,
|
||||
}
|
||||
@ -268,16 +275,20 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
serialize_value(
|
||||
self.schema,
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
key,
|
||||
value,
|
||||
)
|
||||
match self.schema.attribute(key) {
|
||||
Some(attribute) => serialize_value(
|
||||
self.txn,
|
||||
attribute,
|
||||
self.schema.props(attribute),
|
||||
self.document_id,
|
||||
self.document_store,
|
||||
self.document_fields_counts,
|
||||
self.indexer,
|
||||
self.ranked_map,
|
||||
value,
|
||||
),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
@ -285,40 +296,42 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(
|
||||
schema: &Schema,
|
||||
pub fn serialize_value<T: ?Sized>(
|
||||
txn: &mut heed::RwTxn,
|
||||
attribute: SchemaAttr,
|
||||
props: SchemaProps,
|
||||
document_id: DocumentId,
|
||||
document_store: &mut RamDocumentStore,
|
||||
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
|
||||
document_store: DocumentsFields,
|
||||
documents_fields_counts: DocumentsFieldsCounts,
|
||||
indexer: &mut RawIndexer,
|
||||
ranked_map: &mut RankedMap,
|
||||
key: &str,
|
||||
value: &T,
|
||||
) -> Result<(), SerializerError>
|
||||
where
|
||||
T: ser::Serialize,
|
||||
{
|
||||
if let Some(attribute) = schema.attribute(key) {
|
||||
let props = schema.props(attribute);
|
||||
let serialized = serde_json::to_vec(value)?;
|
||||
document_store.put_document_field(txn, document_id, attribute, &serialized)?;
|
||||
|
||||
let serialized = serde_json::to_vec(value)?;
|
||||
document_store.set_document_field(document_id, attribute, serialized);
|
||||
|
||||
if props.is_indexed() {
|
||||
let indexer = Indexer {
|
||||
attribute,
|
||||
indexer,
|
||||
if props.is_indexed() {
|
||||
let indexer = Indexer {
|
||||
attribute,
|
||||
indexer,
|
||||
document_id,
|
||||
};
|
||||
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||
documents_fields_counts.put_document_field_count(
|
||||
txn,
|
||||
document_id,
|
||||
};
|
||||
if let Some(number_of_words) = value.serialize(indexer)? {
|
||||
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
|
||||
}
|
||||
attribute,
|
||||
number_of_words as u64,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
if props.is_ranked() {
|
||||
let number = value.serialize(ConvertToNumber)?;
|
||||
ranked_map.insert(document_id, attribute, number);
|
||||
}
|
||||
if props.is_ranked() {
|
||||
let number = value.serialize(ConvertToNumber)?;
|
||||
ranked_map.insert(document_id, attribute, number);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
@ -26,6 +26,10 @@ impl DocsWords {
|
||||
self.docs_words.delete(writer, &document_id)
|
||||
}
|
||||
|
||||
/// Empties this store inside the given write transaction.
///
/// Simply forwards to `clear` on the wrapped `docs_words` database and
/// returns its result unchanged.
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
    self.docs_words.clear(writer)
}
|
||||
|
||||
pub fn doc_words(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
|
@ -32,6 +32,10 @@ impl DocumentsFields {
|
||||
self.documents_fields.delete_range(writer, start..=end)
|
||||
}
|
||||
|
||||
/// Empties this store inside the given write transaction.
///
/// Simply forwards to `clear` on the wrapped `documents_fields` database
/// and returns its result unchanged.
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
    self.documents_fields.clear(writer)
}
|
||||
|
||||
pub fn document_attribute<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
|
@ -32,6 +32,10 @@ impl DocumentsFieldsCounts {
|
||||
.delete_range(writer, start..=end)
|
||||
}
|
||||
|
||||
/// Empties this store inside the given write transaction.
///
/// Simply forwards to `clear` on the wrapped `documents_fields_counts`
/// database and returns its result unchanged.
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
    self.documents_fields_counts.clear(writer)
}
|
||||
|
||||
pub fn document_field_count(
|
||||
self,
|
||||
reader: &heed::RoTxn,
|
||||
@ -121,7 +125,7 @@ pub struct AllDocumentsFieldsCountsIter<'txn> {
|
||||
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
|
||||
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
|
||||
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
|
@ -166,6 +166,10 @@ impl Index {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
|
||||
update::push_clear_all(writer, self.updates, self.updates_results)
|
||||
}
|
||||
|
||||
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
|
||||
update::SynonymsAddition::new(
|
||||
self.updates,
|
||||
|
@ -23,6 +23,10 @@ impl PostingsLists {
|
||||
self.postings_lists.delete(writer, word)
|
||||
}
|
||||
|
||||
/// Empties this store inside the given write transaction.
///
/// Simply forwards to `clear` on the wrapped `postings_lists` database and
/// returns its result unchanged.
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
    self.postings_lists.clear(writer)
}
|
||||
|
||||
pub fn postings_list<'txn>(
|
||||
self,
|
||||
reader: &'txn heed::RoTxn,
|
||||
|
33
meilidb-core/src/update/clear_all.rs
Normal file
33
meilidb-core/src/update/clear_all.rs
Normal file
@ -0,0 +1,33 @@
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{store, MResult, RankedMap};
|
||||
|
||||
pub fn apply_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &RankedMap::default())?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
documents_fields_store.clear(writer)?;
|
||||
documents_fields_counts_store.clear(writer)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn push_clear_all(
|
||||
writer: &mut heed::RwTxn,
|
||||
updates_store: store::Updates,
|
||||
updates_results_store: store::UpdatesResults,
|
||||
) -> MResult<u64> {
|
||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||
let update = Update::ClearAll;
|
||||
updates_store.put_update(writer, last_update_id, &update)?;
|
||||
|
||||
Ok(last_update_id)
|
||||
}
|
@ -5,7 +5,7 @@ use sdset::{duo::Union, SetOperation};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::raw_indexer::RawIndexer;
|
||||
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
|
||||
use crate::serde::{extract_document_id, serialize_value, Serializer};
|
||||
use crate::store;
|
||||
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
||||
use crate::{Error, MResult, RankedMap};
|
||||
@ -84,12 +84,9 @@ pub fn apply_documents_addition(
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
addition: Vec<serde_json::Value>,
|
||||
) -> MResult<()> {
|
||||
let mut document_ids = HashSet::new();
|
||||
let mut document_store = RamDocumentStore::new();
|
||||
let mut document_fields_counts = HashMap::new();
|
||||
let mut documents_ids = HashSet::new();
|
||||
let mut indexer = RawIndexer::new();
|
||||
|
||||
let schema = match main_store.schema(writer)? {
|
||||
@ -99,20 +96,47 @@ pub fn apply_documents_addition(
|
||||
|
||||
let identifier = schema.identifier_name();
|
||||
|
||||
// 1. store documents ids for future deletion
|
||||
for document in addition.iter() {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
if !documents_ids.insert(document_id) {
|
||||
return Err(Error::DuplicateDocument);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_ids.len();
|
||||
apply_documents_deletion(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
documents_ids.into_iter().collect(),
|
||||
)?;
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
// 3. index the documents fields in the stores
|
||||
for document in addition {
|
||||
let document_id = match extract_document_id(identifier, &document)? {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::MissingDocumentId),
|
||||
};
|
||||
|
||||
// 1. store the document id for future deletion
|
||||
document_ids.insert(document_id);
|
||||
|
||||
// 2. index the document fields in ram stores
|
||||
let serializer = Serializer {
|
||||
txn: writer,
|
||||
schema: &schema,
|
||||
document_store: &mut document_store,
|
||||
document_fields_counts: &mut document_fields_counts,
|
||||
document_store: documents_fields_store,
|
||||
document_fields_counts: documents_fields_counts_store,
|
||||
indexer: &mut indexer,
|
||||
ranked_map: &mut ranked_map,
|
||||
document_id,
|
||||
@ -121,29 +145,94 @@ pub fn apply_documents_addition(
|
||||
document.serialize(serializer)?;
|
||||
}
|
||||
|
||||
// 1. remove the previous documents match indexes
|
||||
let documents_to_insert = document_ids.iter().cloned().collect();
|
||||
apply_documents_deletion(
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
ranked_map.clone(),
|
||||
documents_to_insert,
|
||||
)?;
|
||||
ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
}
|
||||
|
||||
// 2. insert new document attributes in the database
|
||||
for ((id, attr), value) in document_store.into_inner() {
|
||||
documents_fields_store.put_document_field(writer, id, attr, &value)?;
|
||||
pub fn reindex_all_documents(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
let schema = match main_store.schema(writer)? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let mut ranked_map = RankedMap::default();
|
||||
|
||||
// 1. retrieve all documents ids
|
||||
let mut documents_ids_to_reindex = Vec::new();
|
||||
for result in documents_fields_counts_store.documents_ids(writer)? {
|
||||
let document_id = result?;
|
||||
documents_ids_to_reindex.push(document_id);
|
||||
}
|
||||
|
||||
// 3. insert new document attributes counts
|
||||
for ((id, attr), count) in document_fields_counts {
|
||||
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
|
||||
// 2. remove the documents posting lists
|
||||
let number_of_inserted_documents = documents_ids_to_reindex.len();
|
||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
||||
postings_lists_store.clear(writer)?;
|
||||
docs_words_store.clear(writer)?;
|
||||
|
||||
// 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
|
||||
let mut indexer = RawIndexer::new();
|
||||
let mut ram_store = HashMap::new();
|
||||
|
||||
for document_id in documents_ids_to_reindex {
|
||||
for result in documents_fields_store.document_fields(writer, document_id)? {
|
||||
let (attr, bytes) = result?;
|
||||
let value: serde_json::Value = serde_json::from_slice(bytes)?;
|
||||
ram_store.insert((document_id, attr), value);
|
||||
}
|
||||
|
||||
for ((docid, attr), value) in ram_store.drain() {
|
||||
serialize_value(
|
||||
writer,
|
||||
attr,
|
||||
schema.props(attr),
|
||||
docid,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
&mut indexer,
|
||||
&mut ranked_map,
|
||||
&value,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// 4. write the new index in the main store
|
||||
write_documents_addition_index(
|
||||
writer,
|
||||
main_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
ranked_map,
|
||||
number_of_inserted_documents,
|
||||
indexer,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn write_documents_addition_index(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
ranked_map: RankedMap,
|
||||
number_of_inserted_documents: usize,
|
||||
indexer: RawIndexer,
|
||||
) -> MResult<()> {
|
||||
let indexed = indexer.build();
|
||||
let mut delta_words_builder = SetBuilder::memory();
|
||||
|
||||
@ -186,9 +275,7 @@ pub fn apply_documents_addition(
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
|
||||
let inserted_documents_len = document_ids.len() as u64;
|
||||
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
|
||||
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -88,7 +88,6 @@ pub fn apply_documents_deletion(
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
mut ranked_map: RankedMap,
|
||||
deletion: Vec<DocumentId>,
|
||||
) -> MResult<()> {
|
||||
let idset = SetBuf::from_dirty(deletion);
|
||||
@ -98,6 +97,11 @@ pub fn apply_documents_deletion(
|
||||
None => return Err(Error::SchemaMissing),
|
||||
};
|
||||
|
||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
// collect the ranked attributes according to the schema
|
||||
let ranked_attrs: Vec<_> = schema
|
||||
.iter()
|
||||
@ -181,7 +185,6 @@ pub fn apply_documents_deletion(
|
||||
|
||||
main_store.put_words_fst(writer, &words)?;
|
||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||
|
||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||
|
||||
Ok(())
|
||||
|
@ -1,3 +1,4 @@
|
||||
mod clear_all;
|
||||
mod customs_update;
|
||||
mod documents_addition;
|
||||
mod documents_deletion;
|
||||
@ -5,6 +6,7 @@ mod schema_update;
|
||||
mod synonyms_addition;
|
||||
mod synonyms_deletion;
|
||||
|
||||
pub use self::clear_all::{apply_clear_all, push_clear_all};
|
||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
|
||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||
@ -20,11 +22,12 @@ use heed::Result as ZResult;
|
||||
use log::debug;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{store, DocumentId, MResult, RankedMap};
|
||||
use crate::{store, DocumentId, MResult};
|
||||
use meilidb_schema::Schema;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum Update {
|
||||
ClearAll,
|
||||
Schema(Schema),
|
||||
Customs(Vec<u8>),
|
||||
DocumentsAddition(Vec<serde_json::Value>),
|
||||
@ -35,6 +38,7 @@ pub enum Update {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum UpdateType {
|
||||
ClearAll,
|
||||
Schema { schema: Schema },
|
||||
Customs,
|
||||
DocumentsAddition { number: usize },
|
||||
@ -107,13 +111,36 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
debug!("Processing update number {}", update_id);
|
||||
|
||||
let (update_type, result, duration) = match update {
|
||||
Update::ClearAll => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::ClearAll;
|
||||
let result = apply_clear_all(
|
||||
writer,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
Update::Schema(schema) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let update_type = UpdateType::Schema {
|
||||
schema: schema.clone(),
|
||||
};
|
||||
let result = apply_schema_update(writer, index.main, &schema);
|
||||
let result = apply_schema_update(
|
||||
writer,
|
||||
&schema,
|
||||
index.main,
|
||||
index.documents_fields,
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
);
|
||||
|
||||
(update_type, result, start.elapsed())
|
||||
}
|
||||
@ -128,11 +155,6 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
Update::DocumentsAddition(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsAddition {
|
||||
number: documents.len(),
|
||||
};
|
||||
@ -144,7 +166,6 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
ranked_map,
|
||||
documents,
|
||||
);
|
||||
|
||||
@ -153,11 +174,6 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
Update::DocumentsDeletion(documents) => {
|
||||
let start = Instant::now();
|
||||
|
||||
let ranked_map = match index.main.ranked_map(writer)? {
|
||||
Some(ranked_map) => ranked_map,
|
||||
None => RankedMap::default(),
|
||||
};
|
||||
|
||||
let update_type = UpdateType::DocumentsDeletion {
|
||||
number: documents.len(),
|
||||
};
|
||||
@ -169,7 +185,6 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
|
||||
index.documents_fields_counts,
|
||||
index.postings_lists,
|
||||
index.docs_words,
|
||||
ranked_map,
|
||||
documents,
|
||||
);
|
||||
|
||||
|
@ -1,19 +1,58 @@
|
||||
use meilidb_schema::{Diff, Schema};
|
||||
|
||||
use crate::update::documents_addition::reindex_all_documents;
|
||||
use crate::update::{next_update_id, Update};
|
||||
use crate::{error::UnsupportedOperation, store, MResult};
|
||||
use meilidb_schema::Schema;
|
||||
|
||||
pub fn apply_schema_update(
|
||||
writer: &mut heed::RwTxn,
|
||||
main_store: store::Main,
|
||||
new_schema: &Schema,
|
||||
main_store: store::Main,
|
||||
documents_fields_store: store::DocumentsFields,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
docs_words_store: store::DocsWords,
|
||||
) -> MResult<()> {
|
||||
if main_store.schema(writer)?.is_some() {
|
||||
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
|
||||
use UnsupportedOperation::{
|
||||
CannotIntroduceNewSchemaAttribute, CannotRemoveSchemaAttribute,
|
||||
CannotReorderSchemaAttribute, CannotUpdateSchemaIdentifier,
|
||||
};
|
||||
|
||||
let mut need_full_reindexing = false;
|
||||
|
||||
if let Some(old_schema) = main_store.schema(writer)? {
|
||||
for diff in meilidb_schema::diff(&old_schema, new_schema) {
|
||||
match diff {
|
||||
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
|
||||
Diff::AttrMove { .. } => return Err(CannotReorderSchemaAttribute.into()),
|
||||
Diff::AttrPropsChange { old, new, .. } => {
|
||||
if new.indexed != old.indexed {
|
||||
need_full_reindexing = true;
|
||||
}
|
||||
if new.ranked != old.ranked {
|
||||
need_full_reindexing = true;
|
||||
}
|
||||
}
|
||||
Diff::NewAttr { .. } => return Err(CannotIntroduceNewSchemaAttribute.into()),
|
||||
Diff::RemovedAttr { .. } => return Err(CannotRemoveSchemaAttribute.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
main_store
|
||||
.put_schema(writer, new_schema)
|
||||
.map_err(Into::into)
|
||||
main_store.put_schema(writer, new_schema)?;
|
||||
|
||||
if need_full_reindexing {
|
||||
reindex_all_documents(
|
||||
writer,
|
||||
main_store,
|
||||
documents_fields_store,
|
||||
documents_fields_counts_store,
|
||||
postings_lists_store,
|
||||
docs_words_store,
|
||||
)?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn push_schema_update(
|
||||
|
@ -215,11 +215,155 @@ impl fmt::Display for SchemaAttr {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum Diff {
|
||||
IdentChange {
|
||||
old: String,
|
||||
new: String,
|
||||
},
|
||||
AttrMove {
|
||||
name: String,
|
||||
old: usize,
|
||||
new: usize,
|
||||
},
|
||||
AttrPropsChange {
|
||||
name: String,
|
||||
old: SchemaProps,
|
||||
new: SchemaProps,
|
||||
},
|
||||
NewAttr {
|
||||
name: String,
|
||||
pos: usize,
|
||||
props: SchemaProps,
|
||||
},
|
||||
RemovedAttr {
|
||||
name: String,
|
||||
},
|
||||
}
|
||||
|
||||
pub fn diff(old: &Schema, new: &Schema) -> Vec<Diff> {
|
||||
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
|
||||
|
||||
let mut differences = Vec::new();
|
||||
let old = old.to_builder();
|
||||
let new = new.to_builder();
|
||||
|
||||
// check if the old identifier differs from the new one
|
||||
if old.identifier != new.identifier {
|
||||
let old = old.identifier;
|
||||
let new = new.identifier;
|
||||
differences.push(IdentChange { old, new });
|
||||
}
|
||||
|
||||
// compare all old attributes positions
|
||||
// and properties with the new ones
|
||||
for (pos, (name, props)) in old.attributes.iter().enumerate() {
|
||||
match new.attributes.get_full(name) {
|
||||
Some((npos, _, nprops)) => {
|
||||
if pos != npos {
|
||||
let name = name.clone();
|
||||
differences.push(AttrMove {
|
||||
name,
|
||||
old: pos,
|
||||
new: npos,
|
||||
});
|
||||
}
|
||||
if props != nprops {
|
||||
let name = name.clone();
|
||||
differences.push(AttrPropsChange {
|
||||
name,
|
||||
old: *props,
|
||||
new: *nprops,
|
||||
});
|
||||
}
|
||||
}
|
||||
None => differences.push(RemovedAttr { name: name.clone() }),
|
||||
}
|
||||
}
|
||||
|
||||
// retrieve all attributes that
|
||||
// were not present in the old schema
|
||||
for (pos, (name, props)) in new.attributes.iter().enumerate() {
|
||||
if !old.attributes.contains_key(name) {
|
||||
let name = name.clone();
|
||||
differences.push(NewAttr {
|
||||
name,
|
||||
pos,
|
||||
props: *props,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
differences
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
|
||||
/// Checks that `diff` reports every kind of difference, in the order it
/// documents: identifier change first, then moved / modified / removed
/// attributes following the old schema, then the newly introduced ones.
#[test]
fn difference() {
    let before = {
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("alpha", DISPLAYED);
        builder.new_attribute("beta", DISPLAYED | INDEXED);
        builder.new_attribute("gamma", INDEXED);
        builder.new_attribute("omega", INDEXED);
        builder.build()
    };

    let after = {
        let mut builder = SchemaBuilder::with_identifier("kiki");
        builder.new_attribute("beta", DISPLAYED | INDEXED);
        builder.new_attribute("alpha", DISPLAYED | INDEXED);
        builder.new_attribute("delta", RANKED);
        builder.new_attribute("gamma", DISPLAYED);
        builder.build()
    };

    let expected = vec![
        Diff::IdentChange {
            old: String::from("id"),
            new: String::from("kiki"),
        },
        Diff::AttrMove {
            name: String::from("alpha"),
            old: 0,
            new: 1,
        },
        Diff::AttrPropsChange {
            name: String::from("alpha"),
            old: DISPLAYED,
            new: DISPLAYED | INDEXED,
        },
        Diff::AttrMove {
            name: String::from("beta"),
            old: 1,
            new: 0,
        },
        Diff::AttrMove {
            name: String::from("gamma"),
            old: 2,
            new: 3,
        },
        Diff::AttrPropsChange {
            name: String::from("gamma"),
            old: INDEXED,
            new: DISPLAYED,
        },
        Diff::RemovedAttr {
            name: String::from("omega"),
        },
        Diff::NewAttr {
            name: String::from("delta"),
            pos: 2,
            props: RANKED,
        },
    ];

    assert_eq!(diff(&before, &after), expected);
}
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> bincode::Result<()> {
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
|
Reference in New Issue
Block a user