Compare commits

...

44 Commits

Author SHA1 Message Date
a136c62208 Merge pull request #249 from meilisearch/display-all-updates
Display enqueued along with processed updates
2019-10-31 13:53:46 +01:00
cc461b1331 Display enqueued along with processed updates 2019-10-31 12:25:52 +01:00
dbe5363672 Merge pull request #248 from meilisearch/fix-highlight-too-long
Correctly highlight when query string is too long
2019-10-30 18:19:06 +01:00
45d4361e7d Correctly highlight when query string is longer 2019-10-30 17:49:50 +01:00
b28c44cc6b Merge pull request #247 from meilisearch/bump-meilidb
Bump the meili-core/schema/tokenizer crates to 0.5.11
2019-10-30 17:48:26 +01:00
b709a7a30a Bump the meili-core/schema/tokenizer crates to 0.5.11 2019-10-30 17:40:31 +01:00
64c25bdb40 Merge pull request #246 from meilisearch/better-highlighting-area
Make the highlight system much better
2019-10-30 17:39:12 +01:00
c230f244be Make the highlight system much better 2019-10-30 17:32:29 +01:00
02af4ff113 Merge pull request #245 from meilisearch/reindex-all-documents-reduce-memory-usage
Reduce the ram consumption when re-indexing all the documents
2019-10-29 17:54:47 +01:00
4dff8a215e Reduce the ram consumption when re-indexing all the documents 2019-10-29 17:46:23 +01:00
41065305aa Merge pull request #244 from meilisearch/reintroduce-stop-words
Reintroduce stop words
2019-10-29 16:35:03 +01:00
e9dce3ce81 Add a test to ensure that the indexer support stop words 2019-10-29 16:18:06 +01:00
ff7dde7522 Make the RawIndexer support stop words 2019-10-29 16:18:06 +01:00
a226fd23c3 Introduce the stop words deletion update type 2019-10-29 16:18:06 +01:00
776673ebae Introduce the stop words addition update type 2019-10-29 15:24:09 +01:00
32d2cc3aea Merge pull request #243 from meilisearch/all-updates-results
Introduce a function to get all updates results
2019-10-29 11:45:55 +01:00
8a17fcdda5 Introduce a function to get all updates results 2019-10-29 11:37:40 +01:00
9602d7a960 Merge pull request #242 from meilisearch/accept-dup-documents
Make documents additions accept only the last duplicate document
2019-10-28 20:52:40 +01:00
ac12a4b9c9 Make documents additions accept only the last duplicate document 2019-10-28 20:40:33 +01:00
af96050944 Merge pull request #241 from meilisearch/fix-dead-locks
Fix dead locks
2019-10-28 18:20:01 +01:00
a43b37dfc1 Send channel notification when clearing documents 2019-10-28 17:58:22 +01:00
c08dcac1d4 Abort the update transaction before calling the update callback 2019-10-28 17:55:43 +01:00
a17dccd84e Merge pull request #237 from meilisearch/fix-exactness-criterion
Fix the exactness criterion algorithm
2019-10-26 18:43:10 +02:00
9a57cab3ee Fix the exactness criterion algorithm 2019-10-26 18:34:40 +02:00
751b060320 Merge pull request #238 from meilisearch/improve-highlighting
Only highlight query words areas not the whole words
2019-10-26 18:23:19 +02:00
4111b99a6d Only highlight query words areas not the whole words 2019-10-26 15:56:34 +02:00
d6fb2b56d1 Merge pull request #236 from meilisearch/reorder-automatons
Make sure that automatons group with more automatons are better
2019-10-24 15:29:16 +02:00
cb5c77e536 Make sure that automatons group with more automatons are better 2019-10-24 15:18:53 +02:00
44c89b1ea2 Merge pull request #235 from meilisearch/readme-concat-split-query-words
Add information about search concat and split query words support
2019-10-23 18:20:59 +02:00
26a285053b Add information about search concat and split query words support 2019-10-23 18:19:15 +02:00
1446a6a2d2 Merge pull request #234 from meilisearch/clear-all-update-variant
Introduce a clear all documents update
2019-10-23 16:45:37 +02:00
047eba3ff3 Introduce a clear all documents update 2019-10-23 16:39:10 +02:00
8d9d183ce6 Merge pull request #233 from meilisearch/commit-when-update-ok
Commit an update only when it is Ok
2019-10-23 16:07:48 +02:00
eb67195840 Commit an update only when it is Ok 2019-10-23 15:52:40 +02:00
93306c2326 Merge pull request #232 from meilisearch/support-splitted-words
Support splitted words
2019-10-23 13:38:16 +02:00
7d9cf8d713 Clean up the fetch algorithm 2019-10-23 12:06:21 +02:00
03eb7898e7 Introduce a basic working version of phrase query for splitting words 2019-10-23 11:40:13 +02:00
0fbd4cd632 Merge pull request #231 from meilisearch/recursive-object-indexing
Make possible to convert recursive object into strings
2019-10-22 16:20:10 +02:00
858bf359b8 Make possible to convert recursive object into strings 2019-10-22 16:02:02 +02:00
5dc8465ebd Merge pull request #181 from meilisearch/diff-schema
Make possible to update an index schema
2019-10-22 14:23:43 +02:00
0f30a221fa Introduce the reindex_all_documents indexing function 2019-10-22 14:07:27 +02:00
e86a547e93 Introduce a basic schema diff function 2019-10-21 17:57:32 +02:00
32d8b4b83f Merge pull request #230 from meilisearch/moving-to-heed
Move to heed 0.1.0
2019-10-21 13:34:06 +02:00
78535b3e33 Move to heed 0.1.0 2019-10-21 12:05:53 +02:00
38 changed files with 1568 additions and 430 deletions

View File

@ -12,6 +12,7 @@ A _full-text search database_ based on the fast [LMDB key-value store](https://e
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
- Searches for [concatenated](https://github.com/meilisearch/MeiliDB/pull/164) and [splitted query words](https://github.com/meilisearch/MeiliDB/pull/232) to improve the search quality.
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-core"
version = "0.1.0"
version = "0.5.11"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
@ -12,9 +12,10 @@ crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
env_logger = "0.7.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.1.0"
log = "0.4.8"
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.5.11" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.5.11" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3"
@ -24,11 +25,6 @@ siphasher = "0.3.0"
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dependencies.zlmdb]
package = "zerocopy-lmdb"
git = "https://github.com/Kerollmops/zerocopy-lmdb.git"
branch = "master"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"

View File

@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::{Database, Highlight, UpdateResult};
use meilidb_core::{Database, Highlight, ProcessedUpdateResult};
use meilidb_schema::SchemaAttr;
const INDEX_NAME: &str = "default";
@ -40,7 +40,7 @@ struct IndexCommand {
#[derive(Debug, StructOpt)]
struct SearchCommand {
/// The destination where the database must be created.
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
@ -65,10 +65,18 @@ struct SearchCommand {
displayed_fields: Vec<String>,
}
#[derive(Debug, StructOpt)]
struct ShowUpdatesCommand {
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
}
#[derive(Debug, StructOpt)]
enum Command {
Index(IndexCommand),
Search(SearchCommand),
ShowUpdates(ShowUpdatesCommand),
}
impl Command {
@ -76,6 +84,7 @@ impl Command {
match self {
Command::Index(command) => &command.database_path,
Command::Search(command) => &command.database_path,
Command::ShowUpdates(command) => &command.database_path,
}
}
}
@ -88,7 +97,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
let start = Instant::now();
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(INDEX_NAME) {
Some(index) => index,
None => database.create_index(INDEX_NAME).unwrap(),
@ -303,6 +312,7 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
let reader = env.read_txn().unwrap();
let schema = index.main.schema(&reader)?;
reader.abort();
let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
let fields = command.displayed_fields.iter().map(String::as_str);
@ -418,6 +428,23 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
Ok(())
}
fn show_updates_command(
_command: ShowUpdatesCommand,
database: Database,
) -> Result<(), Box<dyn Error>> {
let env = &database.env;
let index = database
.open_index(INDEX_NAME)
.expect("Could not find index");
let reader = env.read_txn().unwrap();
let updates = index.all_updates_status(&reader)?;
println!("{:#?}", updates);
reader.abort();
Ok(())
}
fn main() -> Result<(), Box<dyn Error>> {
env_logger::init();
@ -427,5 +454,6 @@ fn main() -> Result<(), Box<dyn Error>> {
match opt {
Command::Index(command) => index_command(command, database),
Command::Search(command) => search_command(command, database),
Command::ShowUpdates(command) => show_updates_command(command, database),
}
}
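
Note: the `update_fn` wiring above is the core of the CLI's indexing loop: the database thread invokes the callback after each processed update, and the command thread blocks on the channel until the expected update ids arrive. A minimal, self-contained sketch of that pattern (the `ProcessedUpdateResult` struct here is a hypothetical stand-in for the meilidb-core type):

use std::sync::mpsc;
use std::thread;

// Hypothetical stand-in for meilidb_core::ProcessedUpdateResult.
struct ProcessedUpdateResult {
    update_id: u64,
}

fn main() {
    let (sender, receiver) = mpsc::sync_channel(100);
    // The callback the database invokes once an update has been processed.
    let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();

    // Simulate the update thread firing the callback for two updates.
    let updater = thread::spawn(move || {
        update_fn(ProcessedUpdateResult { update_id: 0 });
        update_fn(ProcessedUpdateResult { update_id: 1 });
    });

    // The indexing command side blocks until each expected id arrives.
    for expected in 0..2u64 {
        assert_eq!(receiver.recv().unwrap(), expected);
    }
    updater.join().unwrap();
}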

View File

@ -2,7 +2,7 @@ mod dfa;
mod query_enhancer;
use std::cmp::Reverse;
use std::vec;
use std::{cmp, vec};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
@ -18,27 +18,55 @@ use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<Vec<Automaton>>,
automatons: Vec<AutomatonGroup>,
}
impl AutomatonProducer {
pub fn new(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) =
generate_automatons(reader, query, main_store, synonyms_store)?;
let (automatons, query_enhancer) = generate_automatons(
reader,
query,
main_store,
postings_list_store,
synonyms_store,
)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter()
}
}
#[derive(Debug)]
pub struct AutomatonGroup {
pub is_phrase_query: bool,
pub automatons: Vec<Automaton>,
}
impl AutomatonGroup {
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: false,
automatons,
}
}
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: true,
automatons,
}
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
string
}
fn split_best_frequency<'a>(
reader: &heed::RoTxn,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn generate_automatons(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
@ -136,7 +193,7 @@ fn generate_automatons(
original_automatons.push(automaton);
}
automatons.push(original_automatons);
automatons.push(AutomatonGroup::normal(original_automatons));
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
@ -188,13 +245,27 @@ fn generate_automatons(
Automaton::non_exact(automaton_index, n, synonym)
};
automaton_index += 1;
automatons.push(vec![automaton]);
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
}
if n != 1 {
if n == 1 {
if let Some((left, right)) =
split_best_frequency(reader, &normalized, postings_lists_store)?
{
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
@ -204,16 +275,20 @@ fn generate_automatons(
let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1;
automatons.push(vec![automaton]);
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
// order automatons, the most important first,
// we keep the original automatons at the front.
automatons[1..].sort_by_key(|a| {
let a = a.first().unwrap();
(Reverse(a.is_exact), a.ngram)
automatons[1..].sort_by_key(|group| {
let a = group.automatons.first().unwrap();
(
Reverse(a.is_exact),
a.ngram,
Reverse(group.automatons.len()),
)
});
Ok((automatons, enhancer_builder.build()))
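
The `split_best_frequency` function above drives the new phrase-query support: every split position of a query word is tried, and the split whose rarer half has the highest postings frequency wins. A self-contained sketch of the heuristic, with a toy frequency map standing in for the PostingsLists store:

use std::cmp;
use std::collections::HashMap;

fn split_best_frequency<'a>(
    word: &'a str,
    freqs: &HashMap<&str, usize>, // toy stand-in for postings-list lengths
) -> Option<(&'a str, &'a str)> {
    let mut best = None;
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let left_freq = freqs.get(left).copied().unwrap_or(0);
        let right_freq = freqs.get(right).copied().unwrap_or(0);
        // both halves must exist; prefer the split whose rarer half is most frequent
        let min_freq = cmp::min(left_freq, right_freq);
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
            best = Some((min_freq, left, right));
        }
    }
    best.map(|(_, l, r)| (l, r))
}

fn main() {
    let freqs: HashMap<&str, usize> =
        [("search", 10), ("engine", 8), ("sea", 2)].into_iter().collect();
    // "searchengine" splits where both halves are indexed words
    assert_eq!(
        split_best_frequency("searchengine", &freqs),
        Some(("search", "engine"))
    );
}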

View File

@ -21,16 +21,15 @@ fn number_exact_matches(
let len = group.len();
let mut found_exact = false;
for (pos, _) in is_exact[index..index + len]
.iter()
.filter(|x| **x)
.enumerate()
{
found_exact = true;
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
if *is_exact {
found_exact = true;
let attr = &attribute[index + pos];
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
}
}
}
}
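
The bug this fixes: in the old loop, `enumerate()` ran after `filter()`, so `pos` counted the exact matches seen so far instead of giving the match's real position in the slice, and `attribute[pos]` then looked up the wrong attribute. The rewrite enumerates first and offsets into the parent slice with `index + pos`. A tiny demonstration of the difference:

fn main() {
    let is_exact = [false, false, true];

    // old: positions are assigned after filtering, so the only exact
    // match reports position 0 instead of 2
    let old: Vec<usize> = is_exact
        .iter()
        .filter(|x| **x)
        .enumerate()
        .map(|(pos, _)| pos)
        .collect();
    assert_eq!(old, vec![0]);

    // fixed: enumerate first, then filter, keeping the real slice index
    let fixed: Vec<usize> = is_exact
        .iter()
        .enumerate()
        .filter(|(_, x)| **x)
        .map(|(pos, _)| pos)
        .collect();
    assert_eq!(fixed, vec![2]);
}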

View File

@ -5,28 +5,23 @@ use std::sync::{Arc, RwLock};
use std::{fs, thread};
use crossbeam_channel::Receiver;
use heed::types::{Str, Unit};
use heed::{CompactionOption, Result as ZResult};
use log::{debug, error};
use zlmdb::types::{Str, Unit};
use zlmdb::{CompactionOption, Result as ZResult};
use crate::{store, update, Index, MResult};
pub type BoxUpdateFn = Box<dyn Fn(update::UpdateResult) + Send + Sync + 'static>;
pub type BoxUpdateFn = Box<dyn Fn(update::ProcessedUpdateResult) + Send + Sync + 'static>;
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
pub struct Database {
pub env: zlmdb::Env,
common_store: zlmdb::DynDatabase,
indexes_store: zlmdb::Database<Str, Unit>,
pub env: heed::Env,
common_store: heed::PolyDatabase,
indexes_store: heed::Database<Str, Unit>,
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
}
fn update_awaiter(
receiver: Receiver<()>,
env: zlmdb::Env,
update_fn: Arc<ArcSwapFn>,
index: Index,
) {
fn update_awaiter(receiver: Receiver<()>, env: heed::Env, update_fn: Arc<ArcSwapFn>, index: Index) {
for () in receiver {
// consume all updates in order (oldest first)
loop {
@ -40,8 +35,13 @@ fn update_awaiter(
match update::update_task(&mut writer, index.clone()) {
Ok(Some(status)) => {
if let Err(e) = writer.commit() {
error!("update transaction failed: {}", e)
match status.result {
Ok(_) => {
if let Err(e) = writer.commit() {
error!("update transaction failed: {}", e)
}
}
Err(_) => writer.abort(),
}
if let Some(ref callback) = *update_fn.load() {
@ -67,7 +67,7 @@ impl Database {
pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> {
fs::create_dir_all(path.as_ref())?;
let env = zlmdb::EnvOpenOptions::new()
let env = heed::EnvOpenOptions::new()
.map_size(10 * 1024 * 1024 * 1024) // 10GB
.max_dbs(3000)
.open(path)?;
@ -199,7 +199,7 @@ impl Database {
Ok(indexes.keys().cloned().collect())
}
pub fn common_store(&self) -> zlmdb::DynDatabase {
pub fn common_store(&self) -> heed::PolyDatabase {
self.common_store
}
}
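
This folds the fix from #233 into the new update loop: the transaction is committed only when the update's own result is Ok, so a failed update rolls back instead of persisting partial writes. A sketch of that decision, with hypothetical stand-ins for heed's write transaction:

// Hypothetical stand-ins for heed::RwTxn and an update's result.
struct RwTxn;

impl RwTxn {
    fn commit(self) -> Result<(), String> {
        Ok(())
    }
    fn abort(self) {}
}

fn finish_update(writer: RwTxn, result: Result<(), String>) {
    match result {
        // persist only successful updates
        Ok(_) => {
            if let Err(e) = writer.commit() {
                eprintln!("update transaction failed: {}", e);
            }
        }
        // a failed update must not leave partial writes behind
        Err(_) => writer.abort(),
    }
}

fn main() {
    finish_update(RwTxn, Ok(()));
    finish_update(RwTxn, Err("invalid document".to_string()));
}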

View File

@ -12,7 +12,7 @@ pub enum Error {
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
Fst(fst::Error),
SerdeJson(SerdeJsonError),
Bincode(bincode::Error),
@ -27,8 +27,8 @@ impl From<io::Error> for Error {
}
}
impl From<zlmdb::Error> for Error {
fn from(error: zlmdb::Error) -> Error {
impl From<heed::Error> for Error {
fn from(error: heed::Error) -> Error {
Error::Zlmdb(error)
}
}
@ -79,7 +79,7 @@ impl fmt::Display for Error {
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
Zlmdb(e) => write!(f, "zlmdb error; {}", e),
Zlmdb(e) => write!(f, "heed error; {}", e),
Fst(e) => write!(f, "fst error; {}", e),
SerdeJson(e) => write!(f, "serde json error; {}", e),
Bincode(e) => write!(f, "bincode error; {}", e),
@ -95,6 +95,10 @@ impl error::Error for Error {}
#[derive(Debug)]
pub enum UnsupportedOperation {
SchemaAlreadyExists,
CannotUpdateSchemaIdentifier,
CannotReorderSchemaAttribute,
CannotIntroduceNewSchemaAttribute,
CannotRemoveSchemaAttribute,
}
impl fmt::Display for UnsupportedOperation {
@ -102,6 +106,12 @@ impl fmt::Display for UnsupportedOperation {
use self::UnsupportedOperation::*;
match self {
SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
CannotUpdateSchemaIdentifier => write!(f, "Cannot update the identifier of a schema"),
CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
CannotIntroduceNewSchemaAttribute => {
write!(f, "Cannot introduce new attributes in a schema")
}
CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
}
}
}
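
Taken together, the new `UnsupportedOperation` variants spell out the limits of the schema-diff update from #181: a schema update may not change the identifier, reorder attributes, or introduce or remove attributes, which presumably leaves changing the properties of existing attributes as the supported operation.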

View File

@ -0,0 +1,134 @@
use std::cmp::min;
use std::collections::BTreeMap;
use std::ops::{Index, IndexMut};
// A simple wrapper around a Vec so the storage stays contiguous but can be indexed like a 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
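// Returns the smallest Damerau-Levenshtein distance between `source` and a
// prefix of `target`, together with the byte length of that best prefix,
// e.g. ("Levenste", "Levenshtein") gives (1, 9), matching "Levenshte".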
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len());
assert!(
n <= m,
"the source string must be shorter than the target one"
);
if n == 0 {
return (m as u32, 0);
}
if m == 0 {
return (n as u32, 0);
}
if n == m && source == target {
return (0, m);
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..n + 1 {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..m + 1 {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.iter().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.iter().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in n..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x)
}
}
minimum
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matched_length() {
let query = "Levenste";
let text = "Levenshtein";
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
assert_eq!(dist, 1);
assert_eq!(&text[..length], "Levenshte");
}
#[test]
#[should_panic]
fn matched_length_panic() {
let query = "Levenshtein";
let text = "Levenste";
// this function will panic if source is longer than target
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
}
}

View File

@ -7,6 +7,7 @@ pub mod criterion;
mod database;
mod distinct_map;
mod error;
mod levenshtein;
mod number;
mod query_builder;
mod ranked_map;
@ -23,7 +24,7 @@ pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
use ::serde::{Deserialize, Serialize};
use zerocopy::{AsBytes, FromBytes};

View File

@ -1,4 +1,5 @@
use hashbrown::HashMap;
use std::convert::TryFrom;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
@ -8,8 +9,9 @@ use fst::{IntoStreamer, Streamer};
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
@ -137,8 +139,8 @@ fn multiword_rewrite_matches(
}
fn fetch_raw_documents(
reader: &zlmdb::RoTxn,
automatons: &[Automaton],
reader: &heed::RoTxn,
automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>,
main_store: store::Main,
@ -148,55 +150,101 @@ fn fetch_raw_documents(
let mut matches = Vec::new();
let mut highlights = Vec::new();
for automaton in automatons {
let Automaton {
index,
is_exact,
query_len,
..
} = automaton;
let dfa = automaton.dfa();
for group in automatons_groups {
let AutomatonGroup {
is_phrase_query,
automatons,
} = group;
let phrase_query_len = automatons.len();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let mut tmp_matches = Vec::new();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton {
index,
is_exact,
query_len,
query,
..
} = automaton;
let dfa = automaton.dfa();
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
matches.reserve(doc_indexes.len());
highlights.reserve(doc_indexes.len());
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: di.char_length,
};
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
matches.push((di.document_id, match_));
highlights.push((di.document_id, highlight));
tmp_matches.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: u16::try_from(covered_area).unwrap_or(u16::max_value()),
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
}
}
if *is_phrase_query {
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// keep the pair only if the matches come from consecutive automatons and sit at consecutive word positions
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase query longer than 2
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
}
} else {
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
}
}
let matches = multiword_rewrite_matches(matches, &query_enhancer);
@ -285,7 +333,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn query(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
) -> MResult<Vec<Document>> {
@ -323,7 +371,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
}
fn raw_query<'c, FI>(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
@ -367,15 +415,20 @@ where
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one group at a time
for auts in automaton_producer {
automatons.extend(auts);
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
@ -454,7 +507,7 @@ where
}
fn raw_query_with_distinct<'c, FI, FD>(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
@ -480,15 +533,20 @@ where
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one group at a time
for auts in automaton_producer {
automatons.extend(auts);
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
@ -1697,4 +1755,68 @@ mod tests {
});
assert_matches!(iter.next(), None);
}
#[test]
fn simple_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("engine", &[doc_index(0, 1)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("engine", &[doc_index(1, 2)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
#[test]
fn harder_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("search", &[doc_index(0, 1)][..]),
("engine", &[doc_index(0, 2)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("search", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
("search", &[doc_index(1, 0)][..]),
("search", &[doc_index(1, 1)][..]),
("slow", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
}
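
The heart of the phrase-query change sits in the `windows(2)` loop above: candidate matches are collected per automaton, sorted by document, attribute and word index, and a pair is kept only when its two halves come from consecutive automatons at consecutive word positions. A self-contained sketch of that adjacency check, using the same documents as the `simple_phrase_query_splitting` test (the attribute dimension is dropped for brevity):

fn main() {
    // (document_id, automaton_id, word_index) for the two halves of "searchengine"
    let mut tmp_matches = vec![
        (1u64, 0usize, 0u32), // "search" in doc 1
        (0, 0, 0),            // "search" in doc 0
        (0, 1, 1),            // "engine" in doc 0, right after "search"
        (1, 1, 2),            // "engine" in doc 1, with "slow" in between
    ];
    tmp_matches.sort_unstable_by_key(|&(id, _, word_index)| (id, word_index));

    let mut kept = Vec::new();
    for window in tmp_matches.windows(2) {
        let (ida, ia, wa) = window[0];
        let (idb, ib, wb) = window[1];
        // same document, consecutive automatons, consecutive word positions
        if ida == idb && ia + 1 == ib && wa + 1 == wb {
            kept.push((ida, wa));
            kept.push((idb, wb));
        }
    }

    // only document 0 contains "search" immediately followed by "engine"
    assert_eq!(kept, vec![(0, 0), (0, 1)]);
}

The other change worth noting here is the highlight length: `char_length` is no longer the whole matched word but the covered area computed with `prefix_damerau_levenshtein`, so a query like "Levenste" highlights only "Levenshte" inside "Levenshtein" rather than the full word.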

View File

@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
@ -21,13 +22,14 @@ pub struct Indexed {
}
impl RawIndexer {
pub fn new() -> RawIndexer {
RawIndexer::with_word_limit(1000)
pub fn new(stop_words: fst::Set) -> RawIndexer {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(limit: usize) -> RawIndexer {
pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
RawIndexer {
word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
@ -56,6 +58,7 @@ impl RawIndexer {
id,
attr,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -87,6 +90,7 @@ impl RawIndexer {
id,
attr,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -118,6 +122,7 @@ impl RawIndexer {
id,
attr,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -152,17 +157,12 @@ impl RawIndexer {
}
}
impl Default for RawIndexer {
fn default() -> Self {
Self::new()
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
stop_words: &fst::Set,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
@ -170,16 +170,18 @@ fn index_token(
return false;
}
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
if !stop_words.contains(&token.word) {
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
None => return false,
}
None => return false,
}
true
@ -207,7 +209,7 @@ mod tests {
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new();
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
@ -231,7 +233,7 @@ mod tests {
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new();
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
@ -252,4 +254,33 @@ mod tests {
.get(&"l’éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn basic_stop_words() {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes
.get(&"l’éteindre".to_owned().into_bytes())
.is_some());
}
}
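
Two details worth noting in these tests: `fst::Set::from_iter` requires its keys in lexicographic order, which is why the stop words are first funnelled through `sdset::SetBuf::from_dirty` (it sorts and deduplicates them), and the filtering itself happens at index time inside `index_token`, so stop words simply never reach `words_doc_indexes`.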

View File

@ -12,8 +12,8 @@ impl ser::Serializer for ConvertToString {
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapConvertToString;
type SerializeStruct = StructConvertToString;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -169,7 +169,9 @@ impl ser::Serializer for ConvertToString {
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "map" })
Ok(MapConvertToString {
text: String::new(),
})
}
fn serialize_struct(
@ -177,8 +179,8 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct",
Ok(StructConvertToString {
text: String::new(),
})
}
@ -194,3 +196,63 @@ impl ser::Serializer for ConvertToString {
})
}
}
pub struct MapConvertToString {
text: String,
}
impl ser::SerializeMap for MapConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.text.push_str(&text);
self.text.push_str(" ");
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.text.push_str(&text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
pub struct StructConvertToString {
text: String,
}
impl ser::SerializeStruct for StructConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let value = value.serialize(ConvertToString)?;
self.text.push_str(key);
self.text.push_str(" ");
self.text.push_str(&value);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
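
With these two impls, `ConvertToString` can now flatten nested maps and structs into indexable text instead of rejecting them: keys and values are serialized recursively and joined with spaces, so a hypothetical `{"city": "Paris"}` field becomes the string "city Paris". That is what lets #231 index recursive objects as if they were plain string fields.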

View File

@ -14,7 +14,7 @@ use crate::DocumentId;
#[derive(Debug)]
pub enum DeserializerError {
SerdeJson(SerdeJsonError),
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
Custom(String),
}
@ -28,7 +28,7 @@ impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
@ -42,15 +42,15 @@ impl From<SerdeJsonError> for DeserializerError {
}
}
impl From<zlmdb::Error> for DeserializerError {
fn from(error: zlmdb::Error) -> DeserializerError {
impl From<heed::Error> for DeserializerError {
fn from(error: heed::Error) -> DeserializerError {
DeserializerError::Zlmdb(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub reader: &'a zlmdb::RoTxn,
pub reader: &'a heed::RoTxn,
pub documents_fields: DocumentsFields,
pub schema: &'a Schema,
pub attributes: Option<&'a HashSet<SchemaAttr>>,

View File

@ -20,7 +20,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStruct = StructIndexer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -302,14 +302,14 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
}
}
pub struct StructSerializer<'a> {
pub struct StructIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;

View File

@ -20,22 +20,20 @@ pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
pub use self::serializer::{serialize_value, Serializer};
use std::collections::BTreeMap;
use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde_json::Error as SerdeJsonError;
use crate::{DocumentId, ParseNumberError};
use crate::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdType,
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
SerdeJson(SerdeJsonError),
ParseNumber(ParseNumberError),
UnserializableType { type_name: &'static str },
@ -59,7 +57,7 @@ impl fmt::Display for SerializerError {
SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number")
}
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e)
@ -92,8 +90,8 @@ impl From<SerdeJsonError> for SerializerError {
}
}
impl From<zlmdb::Error> for SerializerError {
fn from(error: zlmdb::Error) -> SerializerError {
impl From<heed::Error> for SerializerError {
fn from(error: heed::Error) -> SerializerError {
SerializerError::Zlmdb(error)
}
}
@ -103,25 +101,3 @@ impl From<ParseNumberError> for SerializerError {
SerializerError::ParseNumber(error)
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
impl Default for RamDocumentStore {
fn default() -> Self {
Self::new()
}
}
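
Removing `RamDocumentStore` is where the memory saving of #245 comes from: every serialized field of every re-indexed document used to be buffered in this in-memory BTreeMap before being flushed, whereas the `Serializer` now carries the write transaction and stores each field directly in the `DocumentsFields` and `DocumentsFieldsCounts` databases as it is produced.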

View File

@ -1,17 +1,17 @@
use meilidb_schema::{Schema, SchemaAttr};
use meilidb_schema::{Schema, SchemaAttr, SchemaProps};
use serde::ser;
use std::collections::HashMap;
use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore;
use crate::store::{DocumentsFields, DocumentsFieldsCounts};
use crate::{DocumentId, RankedMap};
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> {
pub txn: &'a mut heed::RwTxn,
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
pub document_store: DocumentsFields,
pub document_fields_counts: DocumentsFieldsCounts,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
@ -150,6 +150,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -166,6 +167,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(StructSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -189,10 +191,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
pub struct MapSerializer<'a> {
txn: &'a mut heed::RwTxn,
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
@ -229,17 +232,20 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
&key,
value,
)
match self.schema.attribute(&key) {
Some(attribute) => serialize_value(
self.txn,
attribute,
self.schema.props(attribute),
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
value,
),
None => Ok(()),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
@ -248,10 +254,11 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
}
pub struct StructSerializer<'a> {
txn: &'a mut heed::RwTxn,
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
@ -268,16 +275,20 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
where
T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
key,
value,
)
match self.schema.attribute(key) {
Some(attribute) => serialize_value(
self.txn,
attribute,
self.schema.props(attribute),
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
value,
),
None => Ok(()),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
@ -285,40 +296,42 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
pub fn serialize_value<T: ?Sized>(
txn: &mut heed::RwTxn,
attribute: SchemaAttr,
props: SchemaProps,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where
T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let serialized = serde_json::to_vec(value)?;
document_store.put_document_field(txn, document_id, attribute, &serialized)?;
let serialized = serde_json::to_vec(value)?;
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer {
attribute,
indexer,
if props.is_indexed() {
let indexer = Indexer {
attribute,
indexer,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.put_document_field_count(
txn,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
}
attribute,
number_of_words as u64,
)?;
}
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
Ok(())

View File

@ -1,18 +1,18 @@
use super::BEU64;
use crate::DocumentId;
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct DocsWords {
pub(crate) docs_words: zlmdb::Database<OwnedType<BEU64>, ByteSlice>,
pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
}
impl DocsWords {
pub fn put_doc_words(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
words: &fst::Set,
) -> ZResult<()> {
@ -21,18 +21,18 @@ impl DocsWords {
self.docs_words.put(writer, &document_id, bytes)
}
pub fn del_doc_words(
self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<bool> {
pub fn del_doc_words(self, writer: &mut heed::RwTxn, document_id: DocumentId) -> ZResult<bool> {
let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.docs_words.clear(writer)
}
pub fn doc_words(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
) -> ZResult<Option<fst::Set>> {
let document_id = BEU64::new(document_id.0);

View File

@ -1,19 +1,19 @@
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
use super::DocumentAttrKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
pub(crate) documents_fields: zlmdb::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
pub(crate) documents_fields: heed::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl DocumentsFields {
pub fn put_document_field(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
attribute: SchemaAttr,
value: &[u8],
@ -24,7 +24,7 @@ impl DocumentsFields {
pub fn del_all_document_fields(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
@ -32,9 +32,13 @@ impl DocumentsFields {
self.documents_fields.delete_range(writer, start..=end)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields.clear(writer)
}
pub fn document_attribute<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>> {
@ -44,7 +48,7 @@ impl DocumentsFields {
pub fn document_fields<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
@ -55,7 +59,7 @@ impl DocumentsFields {
}
pub struct DocumentFieldsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl<'txn> Iterator for DocumentFieldsIter<'txn> {

View File

@ -1,18 +1,18 @@
use super::DocumentAttrKey;
use crate::DocumentId;
use heed::types::OwnedType;
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use zlmdb::types::OwnedType;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: zlmdb::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl DocumentsFieldsCounts {
pub fn put_document_field_count(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
@ -23,7 +23,7 @@ impl DocumentsFieldsCounts {
pub fn del_all_document_fields_counts(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
@ -32,9 +32,13 @@ impl DocumentsFieldsCounts {
.delete_range(writer, start..=end)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields_counts.clear(writer)
}
pub fn document_field_count(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>> {
@ -47,7 +51,7 @@ impl DocumentsFieldsCounts {
pub fn document_fields_counts<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
@ -56,10 +60,7 @@ impl DocumentsFieldsCounts {
Ok(DocumentFieldsCountsIter { iter })
}
pub fn documents_ids<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
) -> ZResult<DocumentsIdsIter<'txn>> {
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter {
last_seen_id: None,
@ -69,7 +70,7 @@ impl DocumentsFieldsCounts {
pub fn all_documents_fields_counts<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter })
@ -77,7 +78,7 @@ impl DocumentsFieldsCounts {
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
@ -97,7 +98,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl Iterator for DocumentsIdsIter<'_> {
@ -121,10 +122,10 @@ impl Iterator for DocumentsIdsIter<'_> {
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
fn next(&mut self) -> Option<Self::Item> {

View File

@ -1,28 +1,29 @@
use crate::RankedMap;
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
use heed::Result as ZResult;
use meilidb_schema::Schema;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
use zlmdb::Result as ZResult;
const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms";
const STOP_WORDS_KEY: &str = "stop-words";
const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)]
pub struct Main {
pub(crate) main: zlmdb::DynDatabase,
pub(crate) main: heed::PolyDatabase,
}
impl Main {
pub fn put_words_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
}
pub fn words_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
pub fn words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
@ -34,31 +35,32 @@ impl Main {
}
}
pub fn put_schema(self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
pub fn put_schema(self, writer: &mut heed::RwTxn, schema: &Schema) -> ZResult<()> {
self.main
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
.put::<Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
}
pub fn schema(self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
self.main.get::<Str, Serde<Schema>>(reader, SCHEMA_KEY)
}
pub fn put_ranked_map(self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
pub fn schema(self, reader: &heed::RoTxn) -> ZResult<Option<Schema>> {
self.main
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
.get::<Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
}
pub fn ranked_map(self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
pub fn put_ranked_map(self, writer: &mut heed::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
.put::<Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
}
pub fn put_synonyms_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
pub fn ranked_map(self, reader: &heed::RoTxn) -> ZResult<Option<RankedMap>> {
self.main
.get::<Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
}
pub fn synonyms_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
pub fn synonyms_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
@ -70,7 +72,25 @@ impl Main {
}
}
pub fn put_number_of_documents<F>(self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
}
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
where
F: Fn(u64) -> u64,
{
@ -80,7 +100,7 @@ impl Main {
Ok(new)
}
pub fn number_of_documents(self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
pub fn number_of_documents(self, reader: &heed::RoTxn) -> ZResult<u64> {
match self
.main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
@ -90,12 +110,12 @@ impl Main {
}
}
pub fn put_customs(self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
pub fn put_customs(self, writer: &mut heed::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
}
pub fn customs<'txn>(self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
}
}
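For reference, a minimal sketch of how the new stop words entry points on Main might be exercised from crate-internal code; the environment handle, the store copy, and the word list are illustrative assumptions, not part of this changeset:
fn stop_words_roundtrip(env: &heed::Env, main: crate::store::Main) -> crate::MResult<()> {
    // fst requires the input words to be in lexicographic order
    let stop_words = fst::Set::from_iter(vec!["a", "of", "the"]).unwrap();
    let mut writer = env.write_txn()?;
    main.put_stop_words_fst(&mut writer, &stop_words)?;
    writer.commit()?;
    let reader = env.read_txn()?;
    // None is returned when no set was ever stored
    if let Some(set) = main.stop_words_fst(&reader)? {
        assert!(set.contains("the"));
    }
    Ok(())
}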


@@ -20,10 +20,10 @@ pub use self::updates_results::UpdatesResults;
use std::collections::HashSet;
use heed::Result as ZResult;
use meilidb_schema::{Schema, SchemaAttr};
use serde::de;
use zerocopy::{AsBytes, FromBytes};
use zlmdb::Result as ZResult;
use crate::criterion::Criteria;
use crate::serde::Deserializer;
@@ -97,7 +97,7 @@ pub struct Index {
impl Index {
pub fn document<T: de::DeserializeOwned>(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>> {
@@ -127,7 +127,7 @@ impl Index {
pub fn document_attribute<T: de::DeserializeOwned>(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> MResult<Option<T>> {
@@ -140,12 +140,12 @@ impl Index {
}
}
pub fn schema_update(&self, writer: &mut zlmdb::RwTxn, schema: Schema) -> MResult<u64> {
pub fn schema_update(&self, writer: &mut heed::RwTxn, schema: Schema) -> MResult<u64> {
let _ = self.updates_notifier.send(());
update::push_schema_update(writer, self.updates, self.updates_results, schema)
}
pub fn customs_update(&self, writer: &mut zlmdb::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
pub fn customs_update(&self, writer: &mut heed::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
let _ = self.updates_notifier.send(());
update::push_customs_update(writer, self.updates, self.updates_results, customs)
}
@@ -166,6 +166,11 @@ impl Index {
)
}
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
update::push_clear_all(writer, self.updates, self.updates_results)
}
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
update::SynonymsAddition::new(
self.updates,
@@ -182,7 +187,23 @@ impl Index {
)
}
pub fn current_update_id(&self, reader: &zlmdb::RoTxn) -> MResult<Option<u64>> {
pub fn stop_words_addition(&self) -> update::StopWordsAddition {
update::StopWordsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
update::StopWordsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
match self.updates.last_update_id(reader)? {
Some((id, _)) => Ok(Some(id)),
None => Ok(None),
@@ -191,12 +212,38 @@ impl Index {
pub fn update_status(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
update_id: u64,
) -> MResult<update::UpdateStatus> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn all_updates_status(&self, reader: &heed::RoTxn) -> MResult<Vec<update::UpdateStatus>> {
let mut updates = Vec::new();
let mut last_update_result_id = 0;
// retrieve all updates results
if let Some((last_id, _)) = self.updates_results.last_update_id(reader)? {
updates.reserve(last_id as usize);
for id in 0..=last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
last_update_result_id = id;
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update_id(reader)? {
for id in last_update_result_id + 1..=last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
}
}
Ok(updates)
}
pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new(
self.main,
@@ -221,7 +268,7 @@ impl Index {
}
pub fn create(
env: &zlmdb::Env,
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Index> {
@@ -261,7 +308,7 @@ pub fn create(
}
pub fn open(
env: &zlmdb::Env,
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Option<Index>> {

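Two caller-side sketches for this file. First, the open-or-create pattern the two free functions above suggest, using a throwaway notifier channel and a placeholder index name:
fn open_or_create(env: &heed::Env) -> crate::MResult<crate::store::Index> {
    let (sender, _receiver) = crossbeam_channel::unbounded();
    match crate::store::open(env, "example", sender.clone())? {
        Some(index) => Ok(index),
        None => crate::store::create(env, "example", sender),
    }
}
Second, a consumer of the new all_updates_status method; the output formatting is illustrative only:
fn print_all_updates(index: &crate::store::Index, reader: &heed::RoTxn) -> crate::MResult<()> {
    use crate::update::UpdateStatus;
    for status in index.all_updates_status(reader)? {
        match status {
            // enqueued updates now carry their id and type as well
            UpdateStatus::Enqueued(e) => println!("update {} enqueued ({:?})", e.update_id, e.update_type),
            UpdateStatus::Processed(p) => println!("update {} processed in {:?}", p.update_id, p.detailed_duration.main),
            UpdateStatus::Unknown => println!("unknown update id"),
        }
    }
    Ok(())
}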

@@ -1,31 +1,35 @@
use crate::DocIndex;
use heed::types::{ByteSlice, CowSlice};
use heed::Result as ZResult;
use sdset::{Set, SetBuf};
use std::borrow::Cow;
use zlmdb::types::{ByteSlice, CowSlice};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct PostingsLists {
pub(crate) postings_lists: zlmdb::Database<ByteSlice, CowSlice<DocIndex>>,
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
}
impl PostingsLists {
pub fn put_postings_list(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
word: &[u8],
words_indexes: &Set<DocIndex>,
) -> ZResult<()> {
self.postings_lists.put(writer, word, words_indexes)
}
pub fn del_postings_list(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
pub fn del_postings_list(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.postings_lists.clear(writer)
}
pub fn postings_list<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
match self.postings_lists.get(reader, word)? {

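The new clear method complements per-word deletion; a short sketch of both inside one write transaction (the store handle and the word are assumptions for illustration):
fn prune_postings(env: &heed::Env, postings_lists: crate::store::PostingsLists) -> heed::Result<()> {
    let mut writer = env.write_txn()?;
    // remove the posting list of a single word; returns whether it existed
    let _existed = postings_lists.del_postings_list(&mut writer, b"the")?;
    // or wipe every posting list at once, as apply_clear_all does
    postings_lists.clear(&mut writer)?;
    writer.commit()
}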

@@ -1,16 +1,16 @@
use heed::types::ByteSlice;
use heed::Result as ZResult;
use std::sync::Arc;
use zlmdb::types::ByteSlice;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct Synonyms {
pub(crate) synonyms: zlmdb::Database<ByteSlice, ByteSlice>,
pub(crate) synonyms: heed::Database<ByteSlice, ByteSlice>,
}
impl Synonyms {
pub fn put_synonyms(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
word: &[u8],
synonyms: &fst::Set,
) -> ZResult<()> {
@@ -18,11 +18,11 @@ impl Synonyms {
self.synonyms.put(writer, word, bytes)
}
pub fn del_synonyms(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
pub fn del_synonyms(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word)
}
pub fn synonyms(self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
pub fn synonyms(self, reader: &heed::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
match self.synonyms.get(reader, word)? {
Some(bytes) => {
let len = bytes.len();


@@ -1,42 +1,16 @@
use super::BEU64;
use crate::update::Update;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use zlmdb::types::OwnedType;
use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
pub struct SerdeJson<T>(std::marker::PhantomData<T>);
impl<T> BytesEncode for SerdeJson<T>
where
T: Serialize,
{
type EItem = T;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
serde_json::to_vec(item).map(Cow::Owned).ok()
}
}
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
where
T: Deserialize<'a> + Clone,
{
type DItem = T;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
serde_json::from_slice(bytes).ok()
}
}
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct Updates {
pub(crate) updates: zlmdb::Database<OwnedType<BEU64>, SerdeJson<Update>>,
pub(crate) updates: heed::Database<OwnedType<BEU64>, SerdeJson<Update>>,
}
impl Updates {
// TODO do not trigger deserialize if possible
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@@ -44,7 +18,7 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
fn first_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
fn first_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@@ -52,14 +26,14 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
pub fn contains(self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> {
pub fn get(self, reader: &heed::RoTxn, update_id: u64) -> ZResult<Option<Update>> {
let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id).map(|v| v.is_some())
self.updates.get(reader, &update_id)
}
pub fn put_update(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
update_id: u64,
update: &Update,
) -> ZResult<()> {
@@ -68,7 +42,7 @@ impl Updates {
self.updates.put(writer, &update_id, update)
}
pub fn pop_front(self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> {
pub fn pop_front(self, writer: &mut heed::RwTxn) -> ZResult<Option<(u64, Update)>> {
match self.first_update_id(writer)? {
Some((update_id, update)) => {
let key = BEU64::new(update_id);

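Since get now subsumes the removed contains method, a caller that only needs the boolean can wrap it; a hypothetical helper:
fn is_enqueued(updates: crate::store::Updates, reader: &heed::RoTxn, update_id: u64) -> heed::Result<bool> {
    // same behaviour the removed `contains` method had
    updates.get(reader, update_id).map(|update| update.is_some())
}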

@@ -1,15 +1,19 @@
use super::BEU64;
use crate::update::UpdateResult;
use zlmdb::types::{OwnedType, Serde};
use zlmdb::Result as ZResult;
use crate::update::ProcessedUpdateResult;
use heed::types::{OwnedType, SerdeBincode};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
pub(crate) updates_results: zlmdb::Database<OwnedType<BEU64>, Serde<UpdateResult>>,
pub(crate) updates_results:
heed::Database<OwnedType<BEU64>, SerdeBincode<ProcessedUpdateResult>>,
}
impl UpdatesResults {
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
pub fn last_update_id(
self,
reader: &heed::RoTxn,
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@@ -18,9 +22,9 @@ impl UpdatesResults {
pub fn put_update_result(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
update_id: u64,
update_result: &UpdateResult,
update_result: &ProcessedUpdateResult,
) -> ZResult<()> {
let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result)
@@ -28,9 +32,9 @@ impl UpdatesResults {
pub fn update_result(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
update_id: u64,
) -> ZResult<Option<UpdateResult>> {
) -> ZResult<Option<ProcessedUpdateResult>> {
let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id)
}


@@ -0,0 +1,33 @@
use crate::update::{next_update_id, Update};
use crate::{store, MResult, RankedMap};
pub fn apply_clear_all(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &RankedMap::default())?;
main_store.put_number_of_documents(writer, |_| 0)?;
documents_fields_store.clear(writer)?;
documents_fields_counts_store.clear(writer)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
Ok(())
}
pub fn push_clear_all(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::ClearAll;
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
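Caller side, the Index::clear_all method added earlier in this changeset enqueues Update::ClearAll, which update_task later routes to apply_clear_all. A sketch, with the env and index assumed to be already open:
fn wipe_index(env: &heed::Env, index: &crate::store::Index) -> crate::MResult<u64> {
    let mut writer = env.write_txn()?;
    // only enqueues the update; the update loop applies it asynchronously
    let update_id = index.clear_all(&mut writer)?;
    writer.commit()?;
    Ok(update_id)
}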


@@ -1,9 +1,9 @@
use crate::store;
use crate::update::{next_update_id, Update};
use zlmdb::Result as ZResult;
use heed::Result as ZResult;
pub fn apply_customs_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
customs: &[u8],
) -> ZResult<()> {
@@ -11,7 +11,7 @@ pub fn apply_customs_update(
}
pub fn push_customs_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
customs: Vec<u8>,


@@ -1,11 +1,11 @@
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use fst::{set::OpBuilder, SetBuilder};
use sdset::{duo::Union, SetOperation};
use serde::Serialize;
use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
use crate::serde::{extract_document_id, serialize_value, Serializer};
use crate::store;
use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
@@ -35,7 +35,7 @@ impl<D> DocumentsAddition<D> {
self.documents.push(document);
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64>
where
D: serde::Serialize,
{
@@ -57,7 +57,7 @@ impl<D> Extend<D> for DocumentsAddition<D> {
}
pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
@@ -78,19 +78,15 @@ pub fn push_documents_addition<D: serde::Serialize>(
}
pub fn apply_documents_addition(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
addition: Vec<serde_json::Value>,
) -> MResult<()> {
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new();
let mut indexer = RawIndexer::new();
let mut documents_additions = HashMap::new();
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
@@ -99,20 +95,48 @@ pub fn apply_documents_addition(
let identifier = schema.identifier_name();
// 1. store documents ids for future deletion
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
documents_additions.insert(document_id, document);
}
// 2. index the document fields in ram stores
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
documents_ids,
)?;
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &schema,
document_store: &mut document_store,
document_fields_counts: &mut document_fields_counts,
document_store: documents_fields_store,
document_fields_counts: documents_fields_counts_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
@@ -121,29 +145,103 @@ pub fn apply_documents_addition(
document.serialize(serializer)?;
}
// 1. remove the previous documents match indexes
let documents_to_insert = document_ids.iter().cloned().collect();
apply_documents_deletion(
write_documents_addition_index(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
ranked_map.clone(),
documents_to_insert,
)?;
&ranked_map,
number_of_inserted_documents,
indexer,
)
}
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents_fields_store.put_document_field(writer, id, attr, &value)?;
pub fn reindex_all_documents(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = RankedMap::default();
// 1. retrieve all documents ids
let mut documents_ids_to_reindex = Vec::new();
for result in documents_fields_counts_store.documents_ids(writer)? {
let document_id = result?;
documents_ids_to_reindex.push(document_id);
}
// 3. insert new document attributes counts
for ((id, attr), count) in document_fields_counts {
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
// 2. remove the documents posting lists
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |_| 0)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
for documents_ids in documents_ids_to_reindex.chunks(100) {
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
let number_of_inserted_documents = documents_ids.len();
let mut indexer = RawIndexer::new(stop_words);
let mut ram_store = HashMap::new();
for document_id in documents_ids {
for result in documents_fields_store.document_fields(writer, *document_id)? {
let (attr, bytes) = result?;
let value: serde_json::Value = serde_json::from_slice(bytes)?;
ram_store.insert((document_id, attr), value);
}
for ((docid, attr), value) in ram_store.drain() {
serialize_value(
writer,
attr,
schema.props(attr),
*docid,
documents_fields_store,
documents_fields_counts_store,
&mut indexer,
&mut ranked_map,
&value,
)?;
}
}
// 4. write the new index in the main store
write_documents_addition_index(
writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
}
Ok(())
}
pub fn write_documents_addition_index(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer,
) -> MResult<()> {
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
@@ -185,10 +283,8 @@ pub fn apply_documents_addition(
};
main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
main_store.put_ranked_map(writer, ranked_map)?;
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
Ok(())
}
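To tie the addition pipeline together, a caller-side sketch; the documents_addition builder and its update_document method are assumed to exist on Index like the other update builders in this changeset, and the Movie type is purely illustrative:
#[derive(serde::Serialize)]
struct Movie { id: u64, title: String }
fn add_movies(env: &heed::Env, index: &crate::store::Index) -> crate::MResult<u64> {
    let mut addition = index.documents_addition();
    addition.update_document(Movie { id: 1, title: "Carol".into() });
    addition.update_document(Movie { id: 1, title: "Carol (1952)".into() });
    // with the duplicate-handling change, only the last document
    // carrying a given identifier is kept before indexing
    let mut writer = env.write_txn()?;
    let update_id = addition.finalize(&mut writer)?;
    writer.commit()?;
    Ok(update_id)
}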


@@ -49,7 +49,7 @@ impl DocumentsDeletion {
Ok(())
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_documents_deletion(
writer,
@@ -68,7 +68,7 @@ impl Extend<DocumentId> for DocumentsDeletion {
}
pub fn push_documents_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: Vec<DocumentId>,
@@ -82,13 +82,12 @@ pub fn push_documents_deletion(
}
pub fn apply_documents_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> MResult<()> {
let idset = SetBuf::from_dirty(deletion);
@@ -98,6 +97,11 @@ pub fn apply_documents_deletion(
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema
.iter()
@@ -181,7 +185,6 @@ pub fn apply_documents_deletion(
main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
Ok(())

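Deletion mirrors addition, and batching goes through Extend<DocumentId>. A sketch, assuming Index exposes a documents_deletion builder like the other update builders and that DocumentId is the crate's public u64 newtype:
fn delete_documents(env: &heed::Env, index: &crate::store::Index) -> crate::MResult<u64> {
    let mut deletion = index.documents_deletion();
    // DocumentId is assumed here to be the crate's u64 newtype
    deletion.extend(vec![crate::DocumentId(1), crate::DocumentId(2)]);
    let mut writer = env.write_txn()?;
    let update_id = deletion.finalize(&mut writer)?;
    writer.commit()?;
    Ok(update_id)
}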

@@ -1,70 +1,118 @@
mod clear_all;
mod customs_update;
mod documents_addition;
mod documents_deletion;
mod schema_update;
mod stop_words_addition;
mod stop_words_deletion;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::clear_all::{apply_clear_all, push_clear_all};
pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, BTreeSet};
use std::time::{Duration, Instant};
use heed::Result as ZResult;
use log::debug;
use serde::{Deserialize, Serialize};
use zlmdb::Result as ZResult;
use crate::{store, DocumentId, MResult, RankedMap};
use crate::{store, DocumentId, MResult};
use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update {
ClearAll,
Schema(Schema),
Customs(Vec<u8>),
DocumentsAddition(Vec<serde_json::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
StopWordsAddition(BTreeSet<String>),
StopWordsDeletion(BTreeSet<String>),
}
impl Update {
pub fn update_type(&self) -> UpdateType {
match self {
Update::ClearAll => UpdateType::ClearAll,
Update::Schema(schema) => UpdateType::Schema {
schema: schema.clone(),
},
Update::Customs(_) => UpdateType::Customs,
Update::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
number: addition.len(),
},
Update::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
number: deletion.len(),
},
Update::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
number: addition.len(),
},
Update::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
number: deletion.len(),
},
Update::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
number: addition.len(),
},
Update::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
number: deletion.len(),
},
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateType {
ClearAll,
Schema { schema: Schema },
Customs,
DocumentsAddition { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
StopWordsAddition { number: usize },
StopWordsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
pub main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateResult {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedUpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
#[derive(Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnqueuedUpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateStatus {
Enqueued,
Processed(UpdateResult),
Enqueued(EnqueuedUpdateResult),
Processed(ProcessedUpdateResult),
Unknown,
}
pub fn update_status(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
@@ -72,8 +120,11 @@ pub fn update_status(
match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)),
None => {
if updates_store.contains(reader, update_id)? {
Ok(UpdateStatus::Enqueued)
if let Some(update) = updates_store.get(reader, update_id)? {
Ok(UpdateStatus::Enqueued(EnqueuedUpdateResult {
update_id,
update_type: update.update_type(),
}))
} else {
Ok(UpdateStatus::Unknown)
}
@@ -82,7 +133,7 @@ pub fn update_status(
}
pub fn next_update_id(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> ZResult<u64> {
@@ -99,9 +150,9 @@ pub fn next_update_id(
}
pub fn update_task(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
index: store::Index,
) -> MResult<Option<UpdateResult>> {
) -> MResult<Option<ProcessedUpdateResult>> {
let (update_id, update) = match index.updates.pop_front(writer)? {
Some(value) => value,
None => return Ok(None),
@@ -110,13 +161,36 @@ pub fn update_task(
debug!("Processing update number {}", update_id);
let (update_type, result, duration) = match update {
Update::ClearAll => {
let start = Instant::now();
let update_type = UpdateType::ClearAll;
let result = apply_clear_all(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed())
}
Update::Schema(schema) => {
let start = Instant::now();
let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let result = apply_schema_update(writer, index.main, &schema);
let result = apply_schema_update(
writer,
&schema,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed())
}
@@ -131,11 +205,6 @@ pub fn update_task(
Update::DocumentsAddition(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
@@ -147,7 +216,6 @@ pub fn update_task(
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
@@ -156,11 +224,6 @@ pub fn update_task(
Update::DocumentsDeletion(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
@@ -172,7 +235,6 @@ pub fn update_task(
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
@@ -198,6 +260,37 @@ pub fn update_task(
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
Update::StopWordsAddition(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsAddition {
number: stop_words.len(),
};
let result =
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
(update_type, result, start.elapsed())
}
Update::StopWordsDeletion(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsDeletion {
number: stop_words.len(),
};
let result = apply_stop_words_deletion(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
stop_words,
);
(update_type, result, start.elapsed())
}
};
@@ -208,7 +301,7 @@ pub fn update_task(
);
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult {
let status = ProcessedUpdateResult {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),


@@ -1,23 +1,62 @@
use meilidb_schema::{Diff, Schema};
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema;
pub fn apply_schema_update(
writer: &mut zlmdb::RwTxn,
main_store: store::Main,
writer: &mut heed::RwTxn,
new_schema: &Schema,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
if main_store.schema(writer)?.is_some() {
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
use UnsupportedOperation::{
CannotIntroduceNewSchemaAttribute, CannotRemoveSchemaAttribute,
CannotReorderSchemaAttribute, CannotUpdateSchemaIdentifier,
};
let mut need_full_reindexing = false;
if let Some(old_schema) = main_store.schema(writer)? {
for diff in meilidb_schema::diff(&old_schema, new_schema) {
match diff {
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
Diff::AttrMove { .. } => return Err(CannotReorderSchemaAttribute.into()),
Diff::AttrPropsChange { old, new, .. } => {
if new.indexed != old.indexed {
need_full_reindexing = true;
}
if new.ranked != old.ranked {
need_full_reindexing = true;
}
}
Diff::NewAttr { .. } => return Err(CannotIntroduceNewSchemaAttribute.into()),
Diff::RemovedAttr { .. } => return Err(CannotRemoveSchemaAttribute.into()),
}
}
}
main_store
.put_schema(writer, new_schema)
.map_err(Into::into)
main_store.put_schema(writer, new_schema)?;
if need_full_reindexing {
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?
}
Ok(())
}
pub fn push_schema_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
schema: Schema,

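The rules above reject identifier changes, attribute reordering, and attribute addition or removal, and only schedule a full reindex when an attribute's indexed or ranked flag flips. A sketch of a schema pair that takes the reindexing path, built with the meilidb-schema API its own tests use; the attribute names are placeholders:
use meilidb_schema::{diff, Diff, SchemaBuilder, DISPLAYED, INDEXED};
fn flips_indexed_flag() -> bool {
    let mut builder = SchemaBuilder::with_identifier("id");
    builder.new_attribute("title", DISPLAYED);
    let old = builder.build();
    let mut builder = SchemaBuilder::with_identifier("id");
    builder.new_attribute("title", DISPLAYED | INDEXED);
    let new = builder.build();
    // same identifier and attribute order, so only AttrPropsChange can appear
    diff(&old, &new).iter().any(|d| match d {
        Diff::AttrPropsChange { old, new, .. } => old.indexed != new.indexed || old.ranked != new.ranked,
        _ => false,
    })
}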

@@ -0,0 +1,116 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
stop_words: BTreeSet<String>,
}
impl StopWordsAddition {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> StopWordsAddition {
StopWordsAddition {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_stop_words_addition(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_addition(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsAddition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
addition: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in addition {
stop_words_builder.insert(&word).unwrap();
// we remove every posting list associated with a new stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// we also need to remove all the stop words from the main fst
if let Some(word_fst) = main_store.words_fst(writer)? {
let op = OpBuilder::new()
.add(&word_fst)
.add(&delta_stop_words)
.difference();
let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op).unwrap();
let word_fst = word_fst_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_words_fst(writer, &word_fst)?;
}
// now we add all of these stop words to the stop words fst in the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.r#union();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
Ok(())
}
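Caller side, the builder at the top of this file is all that is needed; words are normalized on insertion. A sketch, with the env and index assumed to be already open:
fn add_stop_words(env: &heed::Env, index: &crate::store::Index) -> crate::MResult<u64> {
    let mut addition = index.stop_words_addition();
    addition.add_stop_word("the");
    addition.add_stop_word("of");
    // enqueues Update::StopWordsAddition; applied later by update_task
    let mut writer = env.write_txn()?;
    let update_id = addition.finalize(&mut writer)?;
    writer.commit()?;
    Ok(update_id)
}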


@@ -0,0 +1,112 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
stop_words: BTreeSet<String>,
}
impl StopWordsDeletion {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> StopWordsDeletion {
StopWordsDeletion {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_stop_words_deletion(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_deletion(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsDeletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_deletion(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
deletion: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in deletion {
stop_words_builder.insert(&word).unwrap();
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// now we delete all of these stop words from the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.difference();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
// now that we have set up the stop words
// let's reindex everything...
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?;
Ok(())
}
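Deletion is symmetric on the caller side but far more expensive to apply, since removing a stop word forces reindex_all_documents so the word reappears in the posting lists. A sketch, with env and index assumed:
fn remove_stop_word(env: &heed::Env, index: &crate::store::Index) -> crate::MResult<u64> {
    let mut deletion = index.stop_words_deletion();
    deletion.delete_stop_word("the");
    // enqueues Update::StopWordsDeletion; the full reindex happens in update_task
    let mut writer = env.write_txn()?;
    let update_id = deletion.finalize(&mut writer)?;
    writer.commit()?;
    Ok(update_id)
}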


@@ -42,7 +42,7 @@ impl SynonymsAddition {
.extend(alternatives);
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_addition(
writer,
@@ -55,7 +55,7 @@ impl SynonymsAddition {
}
pub fn push_synonyms_addition(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>,
@@ -69,7 +69,7 @@ pub fn push_synonyms_addition(
}
pub fn apply_synonyms_addition(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>,


@@ -49,7 +49,7 @@ impl SynonymsDeletion {
}
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_deletion(
writer,
@@ -62,7 +62,7 @@ impl SynonymsDeletion {
}
pub fn push_synonyms_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>,
@@ -76,7 +76,7 @@ pub fn push_synonyms_deletion(
}
pub fn apply_synonyms_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>,


@@ -1,6 +1,6 @@
[package]
name = "meilidb-schema"
version = "0.1.0"
version = "0.5.11"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"


@@ -215,11 +215,155 @@ impl fmt::Display for SchemaAttr {
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Diff {
IdentChange {
old: String,
new: String,
},
AttrMove {
name: String,
old: usize,
new: usize,
},
AttrPropsChange {
name: String,
old: SchemaProps,
new: SchemaProps,
},
NewAttr {
name: String,
pos: usize,
props: SchemaProps,
},
RemovedAttr {
name: String,
},
}
pub fn diff(old: &Schema, new: &Schema) -> Vec<Diff> {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut differences = Vec::new();
let old = old.to_builder();
let new = new.to_builder();
// check if the old identifier differs from the new one
if old.identifier != new.identifier {
let old = old.identifier;
let new = new.identifier;
differences.push(IdentChange { old, new });
}
// compare all old attributes positions
// and properties with the new ones
for (pos, (name, props)) in old.attributes.iter().enumerate() {
match new.attributes.get_full(name) {
Some((npos, _, nprops)) => {
if pos != npos {
let name = name.clone();
differences.push(AttrMove {
name,
old: pos,
new: npos,
});
}
if props != nprops {
let name = name.clone();
differences.push(AttrPropsChange {
name,
old: *props,
new: *nprops,
});
}
}
None => differences.push(RemovedAttr { name: name.clone() }),
}
}
// retrieve all attributes that
// were not present in the old schema
for (pos, (name, props)) in new.attributes.iter().enumerate() {
if !old.attributes.contains_key(name) {
let name = name.clone();
differences.push(NewAttr {
name,
pos,
props: *props,
});
}
}
differences
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn difference() {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
builder.new_attribute("omega", INDEXED);
let old = builder.build();
let mut builder = SchemaBuilder::with_identifier("kiki");
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("alpha", DISPLAYED | INDEXED);
builder.new_attribute("delta", RANKED);
builder.new_attribute("gamma", DISPLAYED);
let new = builder.build();
let differences = diff(&old, &new);
let expected = &[
IdentChange {
old: format!("id"),
new: format!("kiki"),
},
AttrMove {
name: format!("alpha"),
old: 0,
new: 1,
},
AttrPropsChange {
name: format!("alpha"),
old: DISPLAYED,
new: DISPLAYED | INDEXED,
},
AttrMove {
name: format!("beta"),
old: 1,
new: 0,
},
AttrMove {
name: format!("gamma"),
old: 2,
new: 3,
},
AttrPropsChange {
name: format!("gamma"),
old: INDEXED,
new: DISPLAYED,
},
RemovedAttr {
name: format!("omega"),
},
NewAttr {
name: format!("delta"),
pos: 2,
props: RANKED,
},
];
assert_eq!(&differences, expected)
}
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");


@@ -1,6 +1,6 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
version = "0.5.11"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"