Compare commits


77 Commits

Author SHA1 Message Date
89fd397903 Bump meilidb-core to v0.6.3 2019-11-05 15:40:04 +01:00
d8392f2f18 Merge pull request #261 from meilisearch/partial-updates
Introduce support for partial updates
2019-11-05 15:39:02 +01:00
36b74f0efe Introduce partial updates to the update system 2019-11-05 15:23:41 +01:00
68c0a36b00 Make the deserialization correctly support optional documents 2019-11-05 15:03:18 +01:00
a127b72a74 Merge pull request #259 from meilisearch/allow-add-schema-attributes-at-end
Allow introducing attributes only at the end of a schema
2019-11-05 12:34:11 +01:00
5782fb9e52 Test adding attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
20319f7974 Allow introducing attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
c4087e2ec2 Merge pull request #258 from meilisearch/debug-schema
Implement a better debug for the schema
2019-11-05 11:35:02 +01:00
b1d1f2f627 Implement a better debug system for the schema 2019-11-05 11:21:07 +01:00
62fe6a8263 Merge pull request #257 from meilisearch/bump-version
Bump meilidb-core/tokenizer versions
2019-11-04 17:26:01 +01:00
d88c10f3b4 Bump meilidb-tokenizer to v0.6.1 2019-11-04 17:17:06 +01:00
00f49990c7 Bump meilidb-core to v0.6.2 2019-11-04 17:16:50 +01:00
89f30ad47e Merge pull request #256 from meilisearch/fix-tokenizer
Fix the tokenizer to make it work with Unicode characters
2019-11-04 17:15:17 +01:00
3b1cbed238 Check that the unidecoded words are not empty 2019-11-04 17:03:11 +01:00
4571b80a49 Update the tests 2019-11-04 16:41:58 +01:00
de2b8672d4 Make the tokenizer understand strange whitespaces/quotes 2019-11-04 16:41:58 +01:00
ccded7b429 Improve the indexer to not deunicode before indexing
Revert of #179
2019-11-04 16:41:58 +01:00
1d4e98410a Merge pull request #255 from meilisearch/bump-version
Bump meilidb-core to v0.6.1
2019-11-04 14:47:53 +01:00
e493b27ef1 Bump meilidb-core to v0.6.1 2019-11-04 14:22:08 +01:00
70589c136f Merge pull request #253 from meilisearch/fix-updates-system
Fix the update system
2019-11-04 13:46:37 +01:00
1c3620a7d4 Add tests to the update system 2019-11-04 13:18:07 +01:00
c2cc0704d7 Clean up the update_awaiter function 2019-11-04 11:11:58 +01:00
2a50e08bb8 Moving to heed v0.5.0 2019-11-04 10:49:27 +01:00
6b326a45d7 Fix the update system to always consume updates even if failing 2019-10-31 17:44:13 +01:00
b73874bf24 Merge pull request #252 from meilisearch/examples-specify-index-name
Allow users to specify the index name to use with the example binaries
2019-10-31 17:02:00 +01:00
95c8ad0f80 Allow users to specify the index name to use with the example binaries 2019-10-31 16:20:31 +01:00
996763cc52 Merge pull request #251 from meilisearch/update-heed
Moving to heed 0.3.0
2019-10-31 16:20:07 +01:00
6a8171d335 Moving to heed 0.3.0 2019-10-31 16:11:02 +01:00
2f32586dab Merge pull request #250 from meilisearch/new-http-server
Introduce a brand new HTTP server
2019-10-31 16:07:52 +01:00
db898001eb Get rid of rust-crypto and uuid 2019-10-31 15:28:37 +01:00
c2a12b661a Make it a runnable server 2019-10-31 15:27:21 +01:00
f51c49db93 Introduce the tide-based HTTP library 2019-10-31 15:02:34 +01:00
1be5b0f327 Bump the meili-core/schema/tokenizer crates to 0.6.0 2019-10-31 14:05:59 +01:00
a136c62208 Merge pull request #249 from meilisearch/display-all-updates
Display enqueued along with processed updates
2019-10-31 13:53:46 +01:00
cc461b1331 Display enqueued along with processed updates 2019-10-31 12:25:52 +01:00
dbe5363672 Merge pull request #248 from meilisearch/fix-highlight-too-long
Correctly highlight when query string is too long
2019-10-30 18:19:06 +01:00
45d4361e7d Correctly highlight when the query string is too long 2019-10-30 17:49:50 +01:00
b28c44cc6b Merge pull request #247 from meilisearch/bump-meilidb
Bump the meili-core/schema/tokenizer crates to 0.5.11
2019-10-30 17:48:26 +01:00
b709a7a30a Bump the meili-core/schema/tokenizer crates to 0.5.11 2019-10-30 17:40:31 +01:00
64c25bdb40 Merge pull request #246 from meilisearch/better-highlighting-area
Make the highlight system much better
2019-10-30 17:39:12 +01:00
c230f244be Make the highlight system much better 2019-10-30 17:32:29 +01:00
02af4ff113 Merge pull request #245 from meilisearch/reindex-all-documents-reduce-memory-usage
Reduce the RAM consumption when re-indexing all the documents
2019-10-29 17:54:47 +01:00
4dff8a215e Reduce the RAM consumption when re-indexing all the documents 2019-10-29 17:46:23 +01:00
41065305aa Merge pull request #244 from meilisearch/reintroduce-stop-words
Reintroduce stop words
2019-10-29 16:35:03 +01:00
e9dce3ce81 Add a test to ensure that the indexer supports stop words 2019-10-29 16:18:06 +01:00
ff7dde7522 Make the RawIndexer support stop words 2019-10-29 16:18:06 +01:00
a226fd23c3 Introduce the stop words deletion update type 2019-10-29 16:18:06 +01:00
776673ebae Introduce the stop words addition update type 2019-10-29 15:24:09 +01:00
32d2cc3aea Merge pull request #243 from meilisearch/all-updates-results
Introduce a function to get all updates results
2019-10-29 11:45:55 +01:00
8a17fcdda5 Introduce a function to get all updates results 2019-10-29 11:37:40 +01:00
9602d7a960 Merge pull request #242 from meilisearch/accept-dup-documents
Make document additions accept only the last duplicate document
2019-10-28 20:52:40 +01:00
ac12a4b9c9 Make document additions accept only the last duplicate document 2019-10-28 20:40:33 +01:00
af96050944 Merge pull request #241 from meilisearch/fix-dead-locks
Fix deadlocks
2019-10-28 18:20:01 +01:00
a43b37dfc1 Send channel notification when clearing documents 2019-10-28 17:58:22 +01:00
c08dcac1d4 Abort the update transaction before calling the update callback 2019-10-28 17:55:43 +01:00
a17dccd84e Merge pull request #237 from meilisearch/fix-exactness-criterion
Fix the exactness criterion algorithm
2019-10-26 18:43:10 +02:00
9a57cab3ee Fix the exactness criterion algorithm 2019-10-26 18:34:40 +02:00
751b060320 Merge pull request #238 from meilisearch/improve-highlighting
Only highlight query word areas, not whole words
2019-10-26 18:23:19 +02:00
4111b99a6d Only highlight query word areas, not whole words 2019-10-26 15:56:34 +02:00
d6fb2b56d1 Merge pull request #236 from meilisearch/reorder-automatons
Make sure that automaton groups with more automatons rank better
2019-10-24 15:29:16 +02:00
cb5c77e536 Make sure that automaton groups with more automatons rank better 2019-10-24 15:18:53 +02:00
44c89b1ea2 Merge pull request #235 from meilisearch/readme-concat-split-query-words
Add information about search support for concatenated and split query words
2019-10-23 18:20:59 +02:00
26a285053b Add information about search support for concatenated and split query words 2019-10-23 18:19:15 +02:00
1446a6a2d2 Merge pull request #234 from meilisearch/clear-all-update-variant
Introduce a clear all documents update
2019-10-23 16:45:37 +02:00
047eba3ff3 Introduce a clear all documents update 2019-10-23 16:39:10 +02:00
8d9d183ce6 Merge pull request #233 from meilisearch/commit-when-update-ok
Commit an update only when it is Ok
2019-10-23 16:07:48 +02:00
eb67195840 Commit an update only when it is Ok 2019-10-23 15:52:40 +02:00
93306c2326 Merge pull request #232 from meilisearch/support-splitted-words
Support split words
2019-10-23 13:38:16 +02:00
7d9cf8d713 Clean up the fetch algorithm 2019-10-23 12:06:21 +02:00
03eb7898e7 Introduce a basic working version of phrase query for splitting words 2019-10-23 11:40:13 +02:00
0fbd4cd632 Merge pull request #231 from meilisearch/recursive-object-indexing
Make it possible to convert recursive objects into strings
2019-10-22 16:20:10 +02:00
858bf359b8 Make it possible to convert recursive objects into strings 2019-10-22 16:02:02 +02:00
5dc8465ebd Merge pull request #181 from meilisearch/diff-schema
Make it possible to update an index schema
2019-10-22 14:23:43 +02:00
0f30a221fa Introduce the reindex_all_documents indexing function 2019-10-22 14:07:27 +02:00
e86a547e93 Introduce a basic schema diff function 2019-10-21 17:57:32 +02:00
32d8b4b83f Merge pull request #230 from meilisearch/moving-to-heed
Move to heed 0.1.0
2019-10-21 13:34:06 +02:00
78535b3e33 Move to heed 0.1.0 2019-10-21 12:05:53 +02:00
64 changed files with 5689 additions and 570 deletions

View File

@ -1,6 +1,7 @@
[workspace]
members = [
"meilidb-core",
"meilidb-http",
"meilidb-schema",
"meilidb-tokenizer",
]

View File

@ -12,6 +12,7 @@ A _full-text search database_ based on the fast [LMDB key-value store](https://e
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
- Supports [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
- Searches for [concatenated](https://github.com/meilisearch/MeiliDB/pull/164) and [split query words](https://github.com/meilisearch/MeiliDB/pull/232) to improve search quality
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index Latin- and kanji-based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results
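Viewed end to end, the diffs below converge on the read path this feature list describes. Here is a minimal sketch against the meilidb-core API as it appears in this compare (the database path, index name, and query string are placeholders, and error handling is reduced to `?`):

```rust
use std::error::Error;
use meilidb_core::Database;

fn main() -> Result<(), Box<dyn Error>> {
    // open (or create) the LMDB-backed database directory
    let database = Database::open_or_create("search.mdb")?;
    let env = &database.env;

    // open a named index, as the example binaries do via `--index-name`
    let index = match database.open_index("default") {
        Some(index) => index,
        None => database.create_index("default")?,
    };

    // queries run inside a read transaction; the range paginates results
    let reader = env.read_txn()?;
    let documents = index.query_builder().query(&reader, "search engine", 0..20)?;
    for document in documents {
        println!("{:?}", document.id);
    }
    reader.abort();

    Ok(())
}
```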

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-core"
version = "0.1.0"
version = "0.6.3"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
@ -12,9 +12,10 @@ crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
env_logger = "0.7.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.5.0"
log = "0.4.8"
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.6.0" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3"
@ -24,11 +25,6 @@ siphasher = "0.3.0"
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dependencies.zlmdb]
package = "zerocopy-lmdb"
git = "https://github.com/Kerollmops/zerocopy-lmdb.git"
branch = "master"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"

View File

@ -12,17 +12,18 @@ use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::{Database, Highlight, UpdateResult};
use meilidb_core::{Database, Highlight, ProcessedUpdateResult};
use meilidb_schema::SchemaAttr;
const INDEX_NAME: &str = "default";
#[derive(Debug, StructOpt)]
struct IndexCommand {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_name: String,
/// The csv file to index.
#[structopt(parse(from_os_str))]
csv_data_path: PathBuf,
@ -40,10 +41,13 @@ struct IndexCommand {
#[derive(Debug, StructOpt)]
struct SearchCommand {
/// The destination where the database must be created.
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_name: String,
/// Timeout after which the search will return results.
#[structopt(long)]
fetch_timeout_ms: Option<u64>,
@ -65,10 +69,21 @@ struct SearchCommand {
displayed_fields: Vec<String>,
}
#[derive(Debug, StructOpt)]
struct ShowUpdatesCommand {
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_name: String,
}
#[derive(Debug, StructOpt)]
enum Command {
Index(IndexCommand),
Search(SearchCommand),
ShowUpdates(ShowUpdatesCommand),
}
impl Command {
@ -76,6 +91,7 @@ impl Command {
match self {
Command::Index(command) => &command.database_path,
Command::Search(command) => &command.database_path,
Command::ShowUpdates(command) => &command.database_path,
}
}
}
@ -88,13 +104,13 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
let start = Instant::now();
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(INDEX_NAME) {
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(&command.index_name) {
Some(index) => index,
None => database.create_index(INDEX_NAME).unwrap(),
None => database.create_index(&command.index_name).unwrap(),
};
let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
let done = database.set_update_callback(&command.index_name, Box::new(update_fn));
assert!(done, "could not set the index update function");
let env = &database.env;
@ -297,12 +313,13 @@ fn crop_text(
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
let env = &database.env;
let index = database
.open_index(INDEX_NAME)
.open_index(&command.index_name)
.expect("Could not find index");
let reader = env.read_txn().unwrap();
let schema = index.main.schema(&reader)?;
reader.abort();
let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
let fields = command.displayed_fields.iter().map(String::as_str);
@ -418,6 +435,23 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
Ok(())
}
fn show_updates_command(
command: ShowUpdatesCommand,
database: Database,
) -> Result<(), Box<dyn Error>> {
let env = &database.env;
let index = database
.open_index(&command.index_name)
.expect("Could not find index");
let reader = env.read_txn().unwrap();
let updates = index.all_updates_status(&reader)?;
println!("{:#?}", updates);
reader.abort();
Ok(())
}
fn main() -> Result<(), Box<dyn Error>> {
env_logger::init();
@ -427,5 +461,6 @@ fn main() -> Result<(), Box<dyn Error>> {
match opt {
Command::Index(command) => index_command(command, database),
Command::Search(command) => search_command(command, database),
Command::ShowUpdates(command) => show_updates_command(command, database),
}
}

View File

@ -2,7 +2,7 @@ mod dfa;
mod query_enhancer;
use std::cmp::Reverse;
use std::vec;
use std::{cmp, vec};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
@ -18,27 +18,55 @@ use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<Vec<Automaton>>,
automatons: Vec<AutomatonGroup>,
}
impl AutomatonProducer {
pub fn new(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) =
generate_automatons(reader, query, main_store, synonyms_store)?;
let (automatons, query_enhancer) = generate_automatons(
reader,
query,
main_store,
postings_list_store,
synonyms_store,
)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter()
}
}
#[derive(Debug)]
pub struct AutomatonGroup {
pub is_phrase_query: bool,
pub automatons: Vec<Automaton>,
}
impl AutomatonGroup {
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: false,
automatons,
}
}
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: true,
automatons,
}
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
string
}
fn split_best_frequency<'a>(
reader: &heed::RoTxn,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
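The function above is the heart of the split-words feature: it tries every split point of an unknown query word and keeps the one whose rarer half is still as frequent as possible in the postings lists. A self-contained model of the same selection rule, with a plain `HashMap` standing in for the postings-lists store (the names and frequencies here are hypothetical):

```rust
use std::cmp;
use std::collections::HashMap;

fn split_best_frequency<'a>(
    word: &'a str,
    frequency: &HashMap<&str, usize>, // stand-in for postings_lists_store
) -> Option<(&'a str, &'a str)> {
    let mut best = None;
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let left_freq = frequency.get(left).copied().unwrap_or(0);
        let right_freq = frequency.get(right).copied().unwrap_or(0);
        // keep the split whose least frequent half is the most frequent
        let min_freq = cmp::min(left_freq, right_freq);
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
            best = Some((min_freq, left, right));
        }
    }
    best.map(|(_, l, r)| (l, r))
}

fn main() {
    let mut frequency = HashMap::new();
    frequency.insert("search", 4);
    frequency.insert("engine", 3);
    frequency.insert("sea", 1); // "sea" + "rchengine" loses: "rchengine" is unknown
    assert_eq!(
        split_best_frequency("searchengine", &frequency),
        Some(("search", "engine"))
    );
}
```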
fn generate_automatons(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
@ -136,7 +193,7 @@ fn generate_automatons(
original_automatons.push(automaton);
}
automatons.push(original_automatons);
automatons.push(AutomatonGroup::normal(original_automatons));
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
@ -188,13 +245,27 @@ fn generate_automatons(
Automaton::non_exact(automaton_index, n, synonym)
};
automaton_index += 1;
automatons.push(vec![automaton]);
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
}
if n != 1 {
if n == 1 {
if let Some((left, right)) =
split_best_frequency(reader, &normalized, postings_lists_store)?
{
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1;
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
@ -204,16 +275,20 @@ fn generate_automatons(
let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1;
automatons.push(vec![automaton]);
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
// order automatons, the most important first,
// we keep the original automatons at the front.
automatons[1..].sort_by_key(|a| {
let a = a.first().unwrap();
(Reverse(a.is_exact), a.ngram)
automatons[1..].sort_by_key(|group| {
let a = group.automatons.first().unwrap();
(
Reverse(a.is_exact),
a.ngram,
Reverse(group.automatons.len()),
)
});
Ok((automatons, enhancer_builder.build()))

View File

@ -21,16 +21,15 @@ fn number_exact_matches(
let len = group.len();
let mut found_exact = false;
for (pos, _) in is_exact[index..index + len]
.iter()
.filter(|x| **x)
.enumerate()
{
found_exact = true;
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
if *is_exact {
found_exact = true;
let attr = &attribute[index + pos];
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
}
}
}
}

View File

@ -5,59 +5,78 @@ use std::sync::{Arc, RwLock};
use std::{fs, thread};
use crossbeam_channel::Receiver;
use log::{debug, error};
use zlmdb::types::{Str, Unit};
use zlmdb::{CompactionOption, Result as ZResult};
use heed::types::{Str, Unit};
use heed::{CompactionOption, Result as ZResult};
use log::debug;
use crate::{store, update, Index, MResult};
pub type BoxUpdateFn = Box<dyn Fn(update::UpdateResult) + Send + Sync + 'static>;
pub type BoxUpdateFn = Box<dyn Fn(update::ProcessedUpdateResult) + Send + Sync + 'static>;
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
pub struct Database {
pub env: zlmdb::Env,
common_store: zlmdb::DynDatabase,
indexes_store: zlmdb::Database<Str, Unit>,
pub env: heed::Env,
common_store: heed::PolyDatabase,
indexes_store: heed::Database<Str, Unit>,
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
}
fn update_awaiter(
receiver: Receiver<()>,
env: zlmdb::Env,
update_fn: Arc<ArcSwapFn>,
index: Index,
) {
macro_rules! r#break_try {
($expr:expr, $msg:tt) => {
match $expr {
core::result::Result::Ok(val) => val,
core::result::Result::Err(err) => {
log::error!(concat!($msg, ": {}"), err);
break;
}
}
};
}
fn update_awaiter(receiver: Receiver<()>, env: heed::Env, update_fn: Arc<ArcSwapFn>, index: Index) {
for () in receiver {
// consume all updates in order (oldest first)
loop {
let mut writer = match env.write_txn() {
Ok(writer) => writer,
Err(e) => {
error!("LMDB writer transaction begin failed: {}", e);
break;
}
};
// instantiate a main/parent transaction
let mut writer = break_try!(env.write_txn(), "LMDB write transaction begin failed");
match update::update_task(&mut writer, index.clone()) {
Ok(Some(status)) => {
if let Err(e) = writer.commit() {
error!("update transaction failed: {}", e)
}
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
}
}
// no more updates to handle for now
Ok(None) => {
// retrieve the update that needs to be processed
let result = index.updates.pop_front(&mut writer);
let (update_id, update) = match break_try!(result, "pop front update failed") {
Some(value) => value,
None => {
debug!("no more updates");
writer.abort();
break;
}
Err(e) => {
error!("update task failed: {}", e);
writer.abort()
}
};
// instantiate a nested transaction
let result = env.nested_write_txn(&mut writer);
let mut nested_writer = break_try!(result, "LMDB nested write transaction failed");
// try to apply the update to the database using the nested transaction
let result = update::update_task(&mut nested_writer, index.clone(), update_id, update);
let status = break_try!(result, "update task failed");
// commit the nested transaction if the update was successful, abort it otherwise
if status.result.is_ok() {
break_try!(nested_writer.commit(), "commit nested transaction failed");
} else {
nested_writer.abort()
}
// write the result of the update in the updates-results store
let updates_results = index.updates_results;
let result = updates_results.put_update_result(&mut writer, update_id, &status);
// always commit the main/parent transaction, even if the update was unsuccessful
break_try!(result, "update result store commit failed");
break_try!(writer.commit(), "update parent transaction failed");
// call the user callback when the update and the result are written consistently
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
}
}
}
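The rewritten `update_awaiter` above encodes a specific transaction discipline: each update runs in a nested (child) transaction that is committed only on success, while its status is written and committed in the parent transaction either way, so even failing updates are consumed and recorded. A stripped-down sketch of that discipline, assuming the heed 0.5 API used in this diff (the database name and keys are placeholders):

```rust
use std::error::Error;
use std::fs;

use heed::types::{Str, Unit};

fn main() -> Result<(), Box<dyn Error>> {
    fs::create_dir_all("updates.mdb")?;
    let env = heed::EnvOpenOptions::new()
        .map_size(10 * 1024 * 1024)
        .max_dbs(10)
        .open("updates.mdb")?;
    let db = env.create_database::<Str, Unit>(Some("sketch"))?;

    let mut parent = env.write_txn()?;

    // run the fallible update inside a nested transaction
    let mut nested = env.nested_write_txn(&mut parent)?;
    let update_result = db.put(&mut nested, "update-0-data", &()); // the "update work"
    if update_result.is_ok() {
        nested.commit()?; // keep the update's writes
    } else {
        nested.abort(); // roll the update back, but keep consuming the queue
    }

    // record the outcome and always commit the parent transaction
    db.put(&mut parent, "update-0-status", &())?;
    parent.commit()?;
    Ok(())
}
```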
@ -67,12 +86,12 @@ impl Database {
pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> {
fs::create_dir_all(path.as_ref())?;
let env = zlmdb::EnvOpenOptions::new()
let env = heed::EnvOpenOptions::new()
.map_size(10 * 1024 * 1024 * 1024) // 10GB
.max_dbs(3000)
.open(path)?;
let common_store = env.create_dyn_database(Some("common"))?;
let common_store = env.create_poly_database(Some("common"))?;
let indexes_store = env.create_database::<Str, Unit>(Some("indexes"))?;
// list all indexes that need to be opened
@ -199,7 +218,519 @@ impl Database {
Ok(indexes.keys().cloned().collect())
}
pub fn common_store(&self) -> zlmdb::DynDatabase {
pub fn common_store(&self) -> heed::PolyDatabase {
self.common_store
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::update::{ProcessedUpdateResult, UpdateStatus};
use crate::DocumentId;
use serde::de::IgnoredAny;
use std::sync::mpsc;
#[test]
fn valid_updates() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
}
#[test]
fn invalid_updates() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
}
#[test]
fn add_schema_attributes_at_end() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let _update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
[attributes."age"]
displayed = true
indexed = true
[attributes."sex"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
reader.abort();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
"age": 21,
"sex": "Male",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
"age": 23,
"sex": "Male",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
// even try to search for a document
let results = index.query_builder().query(&reader, "21 ", 0..20).unwrap();
assert_matches!(results.len(), 1);
reader.abort();
// try to introduce attributes in the middle of the schema
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
[attributes."city"]
displayed = true
indexed = true
[attributes."age"]
displayed = true
indexed = true
[attributes."sex"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
}
#[test]
fn deserialize_documents() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
// DocumentId(7900334843754999545)
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
// DocumentId(8367468610878465872)
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
assert!(document.is_some());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
assert!(document.is_some());
}
#[test]
fn partial_document_update() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."id"]
displayed = true
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
// DocumentId(7900334843754999545)
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
// DocumentId(8367468610878465872)
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
assert!(document.is_some());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
assert!(document.is_some());
reader.abort();
let mut partial_additions = index.documents_partial_addition();
// DocumentId(7900334843754999545)
let partial_doc1 = serde_json::json!({
"id": 123,
"description": "I am the new Marvin",
});
// DocumentId(8367468610878465872)
let partial_doc2 = serde_json::json!({
"id": 234,
"description": "I am the new Kevin",
});
partial_additions.update_document(partial_doc1);
partial_additions.update_document(partial_doc2);
let mut writer = env.write_txn().unwrap();
let update_id = partial_additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<serde_json::Value> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
let new_doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "I am the new Marvin",
});
assert_eq!(document, Some(new_doc1));
let document: Option<serde_json::Value> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
let new_doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "I am the new Kevin",
});
assert_eq!(document, Some(new_doc2));
}
}

View File

@ -12,7 +12,7 @@ pub enum Error {
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
Fst(fst::Error),
SerdeJson(SerdeJsonError),
Bincode(bincode::Error),
@ -27,8 +27,8 @@ impl From<io::Error> for Error {
}
}
impl From<zlmdb::Error> for Error {
fn from(error: zlmdb::Error) -> Error {
impl From<heed::Error> for Error {
fn from(error: heed::Error) -> Error {
Error::Zlmdb(error)
}
}
@ -79,7 +79,7 @@ impl fmt::Display for Error {
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
Zlmdb(e) => write!(f, "zlmdb error; {}", e),
Zlmdb(e) => write!(f, "heed error; {}", e),
Fst(e) => write!(f, "fst error; {}", e),
SerdeJson(e) => write!(f, "serde json error; {}", e),
Bincode(e) => write!(f, "bincode error; {}", e),
@ -95,6 +95,10 @@ impl error::Error for Error {}
#[derive(Debug)]
pub enum UnsupportedOperation {
SchemaAlreadyExists,
CannotUpdateSchemaIdentifier,
CannotReorderSchemaAttribute,
CanOnlyIntroduceNewSchemaAttributesAtEnd,
CannotRemoveSchemaAttribute,
}
impl fmt::Display for UnsupportedOperation {
@ -102,6 +106,12 @@ impl fmt::Display for UnsupportedOperation {
use self::UnsupportedOperation::*;
match self {
SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
CannotUpdateSchemaIdentifier => write!(f, "Cannot update the identifier of a schema"),
CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
CanOnlyIntroduceNewSchemaAttributesAtEnd => {
write!(f, "Can only introduce new attributes at end of a schema")
}
CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
}
}
}

View File

@ -0,0 +1,134 @@
use std::cmp::min;
use std::collections::BTreeMap;
use std::ops::{Index, IndexMut};
// A simple wrapper around a Vec so the data stays contiguous but can be indexed like a 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len());
assert!(
n <= m,
"the source string must be shorter than the target one"
);
if n == 0 {
return (m as u32, 0);
}
if m == 0 {
return (n as u32, 0);
}
if n == m && source == target {
return (0, m);
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..n + 1 {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..m + 1 {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.iter().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.iter().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in n..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x)
}
}
minimum
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matched_length() {
let query = "Levenste";
let text = "Levenshtein";
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
assert_eq!(dist, 1);
assert_eq!(&text[..length], "Levenshte");
}
#[test]
#[should_panic]
fn matched_length_panic() {
let query = "Levenshtein";
let text = "Levenste";
// this function will panic if the source is longer than the target
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
}
}

View File

@ -7,6 +7,7 @@ pub mod criterion;
mod database;
mod distinct_map;
mod error;
mod levenshtein;
mod number;
mod query_builder;
mod ranked_map;
@ -23,7 +24,7 @@ pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
use ::serde::{Deserialize, Serialize};
use zerocopy::{AsBytes, FromBytes};

View File

@ -1,4 +1,5 @@
use hashbrown::HashMap;
use std::convert::TryFrom;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
@ -8,8 +9,9 @@ use fst::{IntoStreamer, Streamer};
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
@ -137,8 +139,8 @@ fn multiword_rewrite_matches(
}
fn fetch_raw_documents(
reader: &zlmdb::RoTxn,
automatons: &[Automaton],
reader: &heed::RoTxn,
automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>,
main_store: store::Main,
@ -148,55 +150,101 @@ fn fetch_raw_documents(
let mut matches = Vec::new();
let mut highlights = Vec::new();
for automaton in automatons {
let Automaton {
index,
is_exact,
query_len,
..
} = automaton;
let dfa = automaton.dfa();
for group in automatons_groups {
let AutomatonGroup {
is_phrase_query,
automatons,
} = group;
let phrase_query_len = automatons.len();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let mut tmp_matches = Vec::new();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton {
index,
is_exact,
query_len,
query,
..
} = automaton;
let dfa = automaton.dfa();
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
matches.reserve(doc_indexes.len());
highlights.reserve(doc_indexes.len());
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: di.char_length,
};
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
matches.push((di.document_id, match_));
highlights.push((di.document_id, highlight));
tmp_matches.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: u16::try_from(covered_area).unwrap_or(u16::max_value()),
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
}
}
if *is_phrase_query {
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// if the matches must follow each other and actually do
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase queries longer than 2 words
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
}
} else {
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
}
}
let matches = multiword_rewrite_matches(matches, &query_enhancer);
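For a phrase-query group, the `is_phrase_query` branch above only keeps matches where consecutive automatons hit consecutive word positions in the same document attribute. A reduced, self-contained model of that adjacency filter (one document, one attribute; the names here are hypothetical):

```rust
// A match is a pair of (automaton id within the group, word index in the text).
// For a two-word phrase like ["search", "engine"], matches survive only when
// automaton 0 and automaton 1 hit adjacent word positions.
fn filter_phrase_matches(mut matches: Vec<(usize, u16)>, phrase_len: usize) -> Vec<(usize, u16)> {
    matches.sort_unstable_by_key(|(_, word_index)| *word_index);
    let mut kept = Vec::new();
    for window in matches.windows(2) {
        let (ia, wa) = window[0];
        let (ib, wb) = window[1];
        // the second automaton must directly follow the first one,
        // the matched words must be adjacent in the text,
        // and the second automaton must be the last word of the phrase
        if ia + 1 == ib && wa + 1 == wb && ib + 1 == phrase_len {
            kept.push((ia, wa));
            kept.push((ib, wb));
        }
    }
    kept
}

fn main() {
    // "search" at word 0 and "engine" at word 1 form the phrase: kept
    assert_eq!(filter_phrase_matches(vec![(0, 0), (1, 1)], 2), vec![(0, 0), (1, 1)]);
    // "search" at word 0 and "engine" at word 2 are not adjacent: dropped
    assert!(filter_phrase_matches(vec![(0, 0), (1, 2)], 2).is_empty());
}
```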
@ -285,7 +333,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn query(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
) -> MResult<Vec<Document>> {
@ -323,7 +371,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
}
fn raw_query<'c, FI>(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
@ -367,15 +415,20 @@ where
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one by one, querying after each
for auts in automaton_producer {
automatons.extend(auts);
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
@ -454,7 +507,7 @@ where
}
fn raw_query_with_distinct<'c, FI, FD>(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
query: &str,
range: Range<usize>,
@ -480,15 +533,20 @@ where
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) =
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one by one, querying after each
for auts in automaton_producer {
automatons.extend(auts);
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
@ -1697,4 +1755,68 @@ mod tests {
});
assert_matches!(iter.next(), None);
}
#[test]
fn simple_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("engine", &[doc_index(0, 1)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("engine", &[doc_index(1, 2)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
#[test]
fn harder_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("search", &[doc_index(0, 1)][..]),
("engine", &[doc_index(0, 2)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("search", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
("search", &[doc_index(1, 0)][..]),
("search", &[doc_index(1, 1)][..]),
("slow", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
}

View File

@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
@ -21,13 +22,14 @@ pub struct Indexed {
}
impl RawIndexer {
pub fn new() -> RawIndexer {
RawIndexer::with_word_limit(1000)
pub fn new(stop_words: fst::Set) -> RawIndexer {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(limit: usize) -> RawIndexer {
pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
RawIndexer {
word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
@ -35,89 +37,40 @@ impl RawIndexer {
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
let mut number_of_words = 0;
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
// we must not count the same words twice
number_of_words = 0;
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
number_of_words += 1;
}
}
number_of_words
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where
I: IntoIterator<Item = &'a str, IntoIter = IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
for token in Tokenizer::new(text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
number_of_words += 1;
if !must_continue {
break;
}
}
let deunicoded: Vec<_> = lowercased
.into_iter()
.map(|lowercase_text| {
if lowercase_text.contains(is_cjk) {
return lowercase_text;
}
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded {
deunicoded
} else {
lowercase_text
}
})
.collect();
let iter = deunicoded.iter().map(|t| t.as_str());
number_of_words
}
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where
I: IntoIterator<Item = &'a str>,
{
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
@ -152,17 +105,12 @@ impl RawIndexer {
}
}
impl Default for RawIndexer {
fn default() -> Self {
Self::new()
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
stop_words: &fst::Set,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
@ -170,16 +118,46 @@ fn index_token(
return false;
}
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
let lower = token.word.to_lowercase();
let token = Token {
word: &lower,
..token
};
if !stop_words.contains(&token.word) {
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
None => return false,
}
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower && !unidecoded.is_empty() {
let token = Token {
word: &unidecoded,
..token
};
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
None => return false,
}
}
}
None => return false,
}
true
@ -207,7 +185,7 @@ mod tests {
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new();
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
@ -222,16 +200,14 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes())
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new();
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
@ -246,10 +222,53 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes())
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn basic_stop_words() {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn no_empty_unidecode() {
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "🇯🇵";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes
.get(&"🇯🇵".to_owned().into_bytes())
.is_some());
}
}
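Taken together, the hunks above change index_token to index up to two normalized variants per token: the lowercased form, skipped entirely when it is a stop word, and, for non-CJK words, its ASCII transliteration. A minimal sketch of that pipeline, assuming the deunicode and fst crates used above (the helper name is illustrative and the CJK check is left out for brevity):

use deunicode::deunicode_with_tofu;

// Word variants that end up in the postings lists for one token.
fn word_variants(word: &str, stop_words: &fst::Set) -> Vec<String> {
    let lower = word.to_lowercase();
    if stop_words.contains(&lower) {
        return Vec::new(); // stop words produce no postings at all
    }
    let mut variants = vec![lower.clone()];
    let unidecoded = deunicode_with_tofu(&lower, "");
    // an empty transliteration (e.g. a flag emoji) must not be indexed,
    // which is what the no_empty_unidecode test above verifies
    if unidecoded != lower && !unidecoded.is_empty() {
        variants.push(unidecoded);
    }
    variants
}

This is why the tests find both "eteindre" and "éteindre": the accented lowercase form and its deunicoded variant are indexed side by side.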

View File

@ -12,8 +12,8 @@ impl ser::Serializer for ConvertToString {
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapConvertToString;
type SerializeStruct = StructConvertToString;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -169,7 +169,9 @@ impl ser::Serializer for ConvertToString {
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "map" })
Ok(MapConvertToString {
text: String::new(),
})
}
fn serialize_struct(
@ -177,8 +179,8 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct",
Ok(StructConvertToString {
text: String::new(),
})
}
@ -194,3 +196,63 @@ impl ser::Serializer for ConvertToString {
})
}
}
pub struct MapConvertToString {
text: String,
}
impl ser::SerializeMap for MapConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.text.push_str(&text);
self.text.push_str(" ");
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.text.push_str(&text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
pub struct StructConvertToString {
text: String,
}
impl ser::SerializeStruct for StructConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let value = value.serialize(ConvertToString)?;
self.text.push_str(key);
self.text.push_str(" ");
self.text.push_str(&value);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
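With these two impls, nested maps and structs no longer abort serialization with an UnserializableType error: they are flattened into a single string that can be tokenized like any other text field. A hedged usage sketch (the Address type is hypothetical):

use serde::Serialize;

#[derive(Serialize)]
struct Address {
    city: &'static str,
    country: &'static str,
}

// ConvertToString is the serializer defined in this file; for a struct it
// concatenates each field name and its value, with a space between them.
fn flatten_for_indexing<T: Serialize>(value: &T) -> Result<String, SerializerError> {
    value.serialize(ConvertToString)
}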

View File

@ -14,7 +14,7 @@ use crate::DocumentId;
#[derive(Debug)]
pub enum DeserializerError {
SerdeJson(SerdeJsonError),
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
Custom(String),
}
@ -28,7 +28,7 @@ impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
@ -42,15 +42,15 @@ impl From<SerdeJsonError> for DeserializerError {
}
}
impl From<zlmdb::Error> for DeserializerError {
fn from(error: zlmdb::Error) -> DeserializerError {
impl From<heed::Error> for DeserializerError {
fn from(error: heed::Error) -> DeserializerError {
DeserializerError::Zlmdb(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub reader: &'a zlmdb::RoTxn,
pub reader: &'a heed::RoTxn,
pub documents_fields: DocumentsFields,
pub schema: &'a Schema,
pub attributes: Option<&'a HashSet<SchemaAttr>>,
@ -63,13 +63,14 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
self.deserialize_option(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
@ -104,16 +105,29 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor
.visit_map(map_deserializer)
.map_err(DeserializerError::from);
let mut iter = iter.peekable();
let result = match iter.peek() {
Some(_) => {
let map_deserializer = de::value::MapDeserializer::new(iter);
visitor
.visit_some(map_deserializer)
.map_err(DeserializerError::from)
}
None => visitor.visit_none(),
};
match error.take() {
Some(error) => Err(error.into()),
None => result,
}
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
}
struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>);
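The visible effect of routing deserialize_any through deserialize_option shows up in Index::document further down: a document whose stored fields have all been removed now deserializes to None instead of producing an error. A hedged usage sketch, where MyDoc is a hypothetical deserializable type and the reader is an open transaction:

// MyDoc: serde::de::DeserializeOwned + std::fmt::Debug (assumed)
let doc: Option<MyDoc> = index.document(&reader, None, DocumentId(7))?;
match doc {
    Some(doc) => println!("found: {:?}", doc),
    None => println!("document 7 has no stored fields"),
}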

View File

@ -20,7 +20,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStruct = StructIndexer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -302,14 +302,14 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
}
}
pub struct StructSerializer<'a> {
pub struct StructIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;

View File

@ -20,22 +20,20 @@ pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
pub use self::serializer::{serialize_value, Serializer};
use std::collections::BTreeMap;
use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde_json::Error as SerdeJsonError;
use crate::{DocumentId, ParseNumberError};
use crate::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdType,
Zlmdb(zlmdb::Error),
Zlmdb(heed::Error),
SerdeJson(SerdeJsonError),
ParseNumber(ParseNumberError),
UnserializableType { type_name: &'static str },
@ -59,7 +57,7 @@ impl fmt::Display for SerializerError {
SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number")
}
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e)
@ -92,8 +90,8 @@ impl From<SerdeJsonError> for SerializerError {
}
}
impl From<zlmdb::Error> for SerializerError {
fn from(error: zlmdb::Error) -> SerializerError {
impl From<heed::Error> for SerializerError {
fn from(error: heed::Error) -> SerializerError {
SerializerError::Zlmdb(error)
}
}
@ -103,25 +101,3 @@ impl From<ParseNumberError> for SerializerError {
SerializerError::ParseNumber(error)
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
impl Default for RamDocumentStore {
fn default() -> Self {
Self::new()
}
}
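Dropping RamDocumentStore means document fields are no longer buffered in a BTreeMap and flushed in a second pass: the serializer now writes each field straight through the heed write transaction. A minimal sketch of the direct path, reusing the store types from this changeset:

fn store_field(
    txn: &mut heed::RwTxn,
    fields: crate::store::DocumentsFields,
    id: crate::DocumentId,
    attr: meilidb_schema::SchemaAttr,
    value: &serde_json::Value,
) -> Result<(), SerializerError> {
    let bytes = serde_json::to_vec(value)?;
    // one LMDB put per (document, attribute) pair, inside the caller's txn
    fields.put_document_field(txn, id, attr, &bytes)?;
    Ok(())
}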

View File

@ -1,31 +1,31 @@
use meilidb_schema::{Schema, SchemaAttr};
use meilidb_schema::{Schema, SchemaAttr, SchemaProps};
use serde::ser;
use std::collections::HashMap;
use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore;
use crate::store::{DocumentsFields, DocumentsFieldsCounts};
use crate::{DocumentId, RankedMap};
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> {
pub struct Serializer<'a, 'b> {
pub txn: &'a mut heed::RwTxn<'b>,
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
pub document_store: DocumentsFields,
pub document_fields_counts: DocumentsFieldsCounts,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeMap = MapSerializer<'a, 'b>;
type SerializeStruct = StructSerializer<'a, 'b>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
@ -150,6 +150,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -166,6 +167,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(StructSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -188,17 +190,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
}
pub struct MapSerializer<'a> {
pub struct MapSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b>,
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
@ -229,17 +232,20 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
&key,
value,
)
match self.schema.attribute(&key) {
Some(attribute) => serialize_value(
self.txn,
attribute,
self.schema.props(attribute),
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
value,
),
None => Ok(()),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
@ -247,16 +253,17 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
}
}
pub struct StructSerializer<'a> {
pub struct StructSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b>,
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
@ -268,16 +275,20 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
where
T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
key,
value,
)
match self.schema.attribute(key) {
Some(attribute) => serialize_value(
self.txn,
attribute,
self.schema.props(attribute),
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
value,
),
None => Ok(()),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
@ -285,40 +296,42 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
pub fn serialize_value<T: ?Sized>(
txn: &mut heed::RwTxn,
attribute: SchemaAttr,
props: SchemaProps,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where
T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let serialized = serde_json::to_vec(value)?;
document_store.put_document_field(txn, document_id, attribute, &serialized)?;
let serialized = serde_json::to_vec(value)?;
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer {
attribute,
indexer,
if props.is_indexed() {
let indexer = Indexer {
attribute,
indexer,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.put_document_field_count(
txn,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
}
attribute,
number_of_words as u64,
)?;
}
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
Ok(())
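Since serialize_value is now public and receives the attribute and its props directly, callers perform the schema lookup themselves and silently skip unknown keys, as both serializers above do. A condensed helper showing that contract (the function name is illustrative):

// Returns true if the field was written, false if the key is not part of
// the schema and was skipped.
fn try_serialize_field<T: ser::Serialize + ?Sized>(
    txn: &mut heed::RwTxn,
    schema: &Schema,
    document_id: DocumentId,
    document_store: DocumentsFields,
    document_fields_counts: DocumentsFieldsCounts,
    indexer: &mut RawIndexer,
    ranked_map: &mut RankedMap,
    key: &str,
    value: &T,
) -> Result<bool, SerializerError> {
    match schema.attribute(key) {
        Some(attribute) => {
            serialize_value(
                txn,
                attribute,
                schema.props(attribute),
                document_id,
                document_store,
                document_fields_counts,
                indexer,
                ranked_map,
                value,
            )?;
            Ok(true)
        }
        None => Ok(false),
    }
}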

View File

@ -1,18 +1,18 @@
use super::BEU64;
use crate::DocumentId;
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct DocsWords {
pub(crate) docs_words: zlmdb::Database<OwnedType<BEU64>, ByteSlice>,
pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
}
impl DocsWords {
pub fn put_doc_words(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
words: &fst::Set,
) -> ZResult<()> {
@ -21,18 +21,18 @@ impl DocsWords {
self.docs_words.put(writer, &document_id, bytes)
}
pub fn del_doc_words(
self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<bool> {
pub fn del_doc_words(self, writer: &mut heed::RwTxn, document_id: DocumentId) -> ZResult<bool> {
let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.docs_words.clear(writer)
}
pub fn doc_words(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
) -> ZResult<Option<fst::Set>> {
let document_id = BEU64::new(document_id.0);

View File

@ -1,19 +1,19 @@
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
use super::DocumentAttrKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
pub(crate) documents_fields: zlmdb::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
pub(crate) documents_fields: heed::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl DocumentsFields {
pub fn put_document_field(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
attribute: SchemaAttr,
value: &[u8],
@ -24,17 +24,21 @@ impl DocumentsFields {
pub fn del_all_document_fields(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields.delete_range(writer, start..=end)
self.documents_fields.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields.clear(writer)
}
pub fn document_attribute<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>> {
@ -44,18 +48,18 @@ impl DocumentsFields {
pub fn document_fields<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields.range(reader, start..=end)?;
let iter = self.documents_fields.range(reader, &(start..=end))?;
Ok(DocumentFieldsIter { iter })
}
}
pub struct DocumentFieldsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl<'txn> Iterator for DocumentFieldsIter<'txn> {

View File

@ -1,18 +1,18 @@
use super::DocumentAttrKey;
use crate::DocumentId;
use heed::types::OwnedType;
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use zlmdb::types::OwnedType;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: zlmdb::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl DocumentsFieldsCounts {
pub fn put_document_field_count(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
@ -23,18 +23,22 @@ impl DocumentsFieldsCounts {
pub fn del_all_document_fields_counts(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields_counts
.delete_range(writer, start..=end)
.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields_counts.clear(writer)
}
pub fn document_field_count(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>> {
@ -47,19 +51,16 @@ impl DocumentsFieldsCounts {
pub fn document_fields_counts<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields_counts.range(reader, start..=end)?;
let iter = self.documents_fields_counts.range(reader, &(start..=end))?;
Ok(DocumentFieldsCountsIter { iter })
}
pub fn documents_ids<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
) -> ZResult<DocumentsIdsIter<'txn>> {
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter {
last_seen_id: None,
@ -69,7 +70,7 @@ impl DocumentsFieldsCounts {
pub fn all_documents_fields_counts<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter })
@ -77,7 +78,7 @@ impl DocumentsFieldsCounts {
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
@ -97,7 +98,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl Iterator for DocumentsIdsIter<'_> {
@ -121,10 +122,10 @@ impl Iterator for DocumentsIdsIter<'_> {
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
}
impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
fn next(&mut self) -> Option<Self::Item> {

View File

@ -1,28 +1,29 @@
use crate::RankedMap;
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
use heed::Result as ZResult;
use meilidb_schema::Schema;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
use zlmdb::Result as ZResult;
const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms";
const STOP_WORDS_KEY: &str = "stop-words";
const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)]
pub struct Main {
pub(crate) main: zlmdb::DynDatabase,
pub(crate) main: heed::PolyDatabase,
}
impl Main {
pub fn put_words_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
}
pub fn words_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
pub fn words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
@ -34,31 +35,32 @@ impl Main {
}
}
pub fn put_schema(self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
pub fn put_schema(self, writer: &mut heed::RwTxn, schema: &Schema) -> ZResult<()> {
self.main
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
.put::<Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
}
pub fn schema(self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
self.main.get::<Str, Serde<Schema>>(reader, SCHEMA_KEY)
}
pub fn put_ranked_map(self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
pub fn schema(self, reader: &heed::RoTxn) -> ZResult<Option<Schema>> {
self.main
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
.get::<Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
}
pub fn ranked_map(self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
pub fn put_ranked_map(self, writer: &mut heed::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
.put::<Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
}
pub fn put_synonyms_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
pub fn ranked_map(self, reader: &heed::RoTxn) -> ZResult<Option<RankedMap>> {
self.main
.get::<Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
}
pub fn synonyms_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
pub fn synonyms_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
@ -70,7 +72,25 @@ impl Main {
}
}
pub fn put_number_of_documents<F>(self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
}
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
where
F: Fn(u64) -> u64,
{
@ -80,7 +100,7 @@ impl Main {
Ok(new)
}
pub fn number_of_documents(self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
pub fn number_of_documents(self, reader: &heed::RoTxn) -> ZResult<u64> {
match self
.main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
@ -90,12 +110,12 @@ impl Main {
}
}
pub fn put_customs(self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
pub fn put_customs(self, writer: &mut heed::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
}
pub fn customs<'txn>(self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
}
}
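The stop-words set is persisted the same way as the words and synonyms fsts: the raw fst bytes under a dedicated main-store key. A hedged round-trip sketch, assuming a Main handle plus open writer/reader transactions:

// Set::from_iter requires lexicographically sorted input
let stop_words = fst::Set::from_iter(vec!["de", "la", "les"]).unwrap();
main.put_stop_words_fst(&mut writer, &stop_words)?;

let loaded = main.stop_words_fst(&reader)?.unwrap_or_default();
assert!(loaded.contains("la"));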

View File

@ -20,10 +20,10 @@ pub use self::updates_results::UpdatesResults;
use std::collections::HashSet;
use heed::Result as ZResult;
use meilidb_schema::{Schema, SchemaAttr};
use serde::de;
use serde::de::{self, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use zlmdb::Result as ZResult;
use crate::criterion::Criteria;
use crate::serde::Deserializer;
@ -97,7 +97,7 @@ pub struct Index {
impl Index {
pub fn document<T: de::DeserializeOwned>(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>> {
@ -120,14 +120,12 @@ impl Index {
attributes: attributes.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
Ok(T::deserialize(&mut deserializer).map(Some)?)
Ok(Option::<T>::deserialize(&mut deserializer)?)
}
pub fn document_attribute<T: de::DeserializeOwned>(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> MResult<Option<T>> {
@ -140,12 +138,12 @@ impl Index {
}
}
pub fn schema_update(&self, writer: &mut zlmdb::RwTxn, schema: Schema) -> MResult<u64> {
pub fn schema_update(&self, writer: &mut heed::RwTxn, schema: Schema) -> MResult<u64> {
let _ = self.updates_notifier.send(());
update::push_schema_update(writer, self.updates, self.updates_results, schema)
}
pub fn customs_update(&self, writer: &mut zlmdb::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
pub fn customs_update(&self, writer: &mut heed::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
let _ = self.updates_notifier.send(());
update::push_customs_update(writer, self.updates, self.updates_results, customs)
}
@ -158,6 +156,14 @@ impl Index {
)
}
pub fn documents_partial_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new_partial(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_deletion(&self) -> update::DocumentsDeletion {
update::DocumentsDeletion::new(
self.updates,
@ -166,6 +172,11 @@ impl Index {
)
}
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
update::push_clear_all(writer, self.updates, self.updates_results)
}
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
update::SynonymsAddition::new(
self.updates,
@ -182,7 +193,23 @@ impl Index {
)
}
pub fn current_update_id(&self, reader: &zlmdb::RoTxn) -> MResult<Option<u64>> {
pub fn stop_words_addition(&self) -> update::StopWordsAddition {
update::StopWordsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
update::StopWordsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
match self.updates.last_update_id(reader)? {
Some((id, _)) => Ok(Some(id)),
None => Ok(None),
@ -191,12 +218,38 @@ impl Index {
pub fn update_status(
&self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
update_id: u64,
) -> MResult<update::UpdateStatus> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn all_updates_status(&self, reader: &heed::RoTxn) -> MResult<Vec<update::UpdateStatus>> {
let mut updates = Vec::new();
let mut last_update_result_id = 0;
// retrieve all updates results
if let Some((last_id, _)) = self.updates_results.last_update_id(reader)? {
updates.reserve(last_id as usize);
for id in 0..=last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
last_update_result_id = id;
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update_id(reader)? {
for id in last_update_result_id + 1..=last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
}
}
Ok(updates)
}
pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new(
self.main,
@ -221,7 +274,7 @@ impl Index {
}
pub fn create(
env: &zlmdb::Env,
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Index> {
@ -236,7 +289,7 @@ pub fn create(
let updates_results_name = updates_results_name(name);
// open all the stores
let main = env.create_dyn_database(Some(&main_name))?;
let main = env.create_poly_database(Some(&main_name))?;
let postings_lists = env.create_database(Some(&postings_lists_name))?;
let documents_fields = env.create_database(Some(&documents_fields_name))?;
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
@ -261,7 +314,7 @@ pub fn create(
}
pub fn open(
env: &zlmdb::Env,
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Option<Index>> {
@ -276,7 +329,7 @@ pub fn open(
let updates_results_name = updates_results_name(name);
// open all the stores
let main = match env.open_dyn_database(Some(&main_name))? {
let main = match env.open_poly_database(Some(&main_name))? {
Some(main) => main,
None => return Ok(None),
};
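A hedged usage sketch of the new index entry points, assuming an Index handle, open transactions, and serde_json documents; the flow mirrors the existing additions:

let mut addition = index.documents_partial_addition();
addition.extend(vec![serde_json::json!({ "id": 42, "title": "patched" })]);
let update_id = addition.finalize(&mut writer)?;
writer.commit()?;

match index.update_status(&reader, update_id)? {
    update::UpdateStatus::Enqueued(enqueued) => println!("waiting: {:?}", enqueued.update_type),
    update::UpdateStatus::Processed(result) => println!("done: {:?}", result.result),
    update::UpdateStatus::Unknown => println!("unknown update id"),
}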

View File

@ -1,31 +1,35 @@
use crate::DocIndex;
use heed::types::{ByteSlice, CowSlice};
use heed::Result as ZResult;
use sdset::{Set, SetBuf};
use std::borrow::Cow;
use zlmdb::types::{ByteSlice, CowSlice};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct PostingsLists {
pub(crate) postings_lists: zlmdb::Database<ByteSlice, CowSlice<DocIndex>>,
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
}
impl PostingsLists {
pub fn put_postings_list(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
word: &[u8],
words_indexes: &Set<DocIndex>,
) -> ZResult<()> {
self.postings_lists.put(writer, word, words_indexes)
}
pub fn del_postings_list(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
pub fn del_postings_list(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word)
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.postings_lists.clear(writer)
}
pub fn postings_list<'txn>(
self,
reader: &'txn zlmdb::RoTxn,
reader: &'txn heed::RoTxn,
word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
match self.postings_lists.get(reader, word)? {

View File

@ -1,16 +1,16 @@
use heed::types::ByteSlice;
use heed::Result as ZResult;
use std::sync::Arc;
use zlmdb::types::ByteSlice;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)]
pub struct Synonyms {
pub(crate) synonyms: zlmdb::Database<ByteSlice, ByteSlice>,
pub(crate) synonyms: heed::Database<ByteSlice, ByteSlice>,
}
impl Synonyms {
pub fn put_synonyms(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
word: &[u8],
synonyms: &fst::Set,
) -> ZResult<()> {
@ -18,11 +18,11 @@ impl Synonyms {
self.synonyms.put(writer, word, bytes)
}
pub fn del_synonyms(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
pub fn del_synonyms(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word)
}
pub fn synonyms(self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
pub fn synonyms(self, reader: &heed::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
match self.synonyms.get(reader, word)? {
Some(bytes) => {
let len = bytes.len();

View File

@ -1,42 +1,16 @@
use super::BEU64;
use crate::update::Update;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use zlmdb::types::OwnedType;
use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
pub struct SerdeJson<T>(std::marker::PhantomData<T>);
impl<T> BytesEncode for SerdeJson<T>
where
T: Serialize,
{
type EItem = T;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
serde_json::to_vec(item).map(Cow::Owned).ok()
}
}
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
where
T: Deserialize<'a> + Clone,
{
type DItem = T;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
serde_json::from_slice(bytes).ok()
}
}
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct Updates {
pub(crate) updates: zlmdb::Database<OwnedType<BEU64>, SerdeJson<Update>>,
pub(crate) updates: heed::Database<OwnedType<BEU64>, SerdeJson<Update>>,
}
impl Updates {
// TODO do not trigger deserialize if possible
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -44,7 +18,7 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
fn first_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
fn first_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -52,14 +26,14 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
pub fn contains(self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> {
pub fn get(self, reader: &heed::RoTxn, update_id: u64) -> ZResult<Option<Update>> {
let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id).map(|v| v.is_some())
self.updates.get(reader, &update_id)
}
pub fn put_update(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
update_id: u64,
update: &Update,
) -> ZResult<()> {
@ -68,7 +42,7 @@ impl Updates {
self.updates.put(writer, &update_id, update)
}
pub fn pop_front(self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> {
pub fn pop_front(self, writer: &mut heed::RwTxn) -> ZResult<Option<(u64, Update)>> {
match self.first_update_id(writer)? {
Some((update_id, update)) => {
let key = BEU64::new(update_id);

View File

@ -1,15 +1,19 @@
use super::BEU64;
use crate::update::UpdateResult;
use zlmdb::types::{OwnedType, Serde};
use zlmdb::Result as ZResult;
use crate::update::ProcessedUpdateResult;
use heed::types::{OwnedType, SerdeBincode};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
pub(crate) updates_results: zlmdb::Database<OwnedType<BEU64>, Serde<UpdateResult>>,
pub(crate) updates_results:
heed::Database<OwnedType<BEU64>, SerdeBincode<ProcessedUpdateResult>>,
}
impl UpdatesResults {
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
pub fn last_update_id(
self,
reader: &heed::RoTxn,
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -18,9 +22,9 @@ impl UpdatesResults {
pub fn put_update_result(
self,
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
update_id: u64,
update_result: &UpdateResult,
update_result: &ProcessedUpdateResult,
) -> ZResult<()> {
let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result)
@ -28,9 +32,9 @@ impl UpdatesResults {
pub fn update_result(
self,
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
update_id: u64,
) -> ZResult<Option<UpdateResult>> {
) -> ZResult<Option<ProcessedUpdateResult>> {
let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id)
}

View File

@ -0,0 +1,33 @@
use crate::update::{next_update_id, Update};
use crate::{store, MResult, RankedMap};
pub fn apply_clear_all(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &RankedMap::default())?;
main_store.put_number_of_documents(writer, |_| 0)?;
documents_fields_store.clear(writer)?;
documents_fields_counts_store.clear(writer)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
Ok(())
}
pub fn push_clear_all(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::ClearAll;
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
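Like every other write, clearing the index goes through the update queue: push_clear_all only enqueues an Update::ClearAll, and apply_clear_all runs later inside the update task. A one-line usage sketch through the Index handle shown above:

// enqueue a wipe of the whole index; returns an update id to poll
let update_id = index.clear_all(&mut writer)?;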

View File

@ -1,9 +1,9 @@
use crate::store;
use crate::update::{next_update_id, Update};
use zlmdb::Result as ZResult;
use heed::Result as ZResult;
pub fn apply_customs_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
customs: &[u8],
) -> ZResult<()> {
@ -11,7 +11,7 @@ pub fn apply_customs_update(
}
pub fn push_customs_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
customs: Vec<u8>,

View File

@ -1,11 +1,11 @@
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use fst::{set::OpBuilder, SetBuilder};
use sdset::{duo::Union, SetOperation};
use serde::Serialize;
use serde::{Deserialize, Serialize};
use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
use crate::store;
use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
@ -15,6 +15,7 @@ pub struct DocumentsAddition<D> {
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
documents: Vec<D>,
is_partial: bool,
}
impl<D> DocumentsAddition<D> {
@ -28,6 +29,21 @@ impl<D> DocumentsAddition<D> {
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: false,
}
}
pub fn new_partial(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: true,
}
}
@ -35,7 +51,7 @@ impl<D> DocumentsAddition<D> {
self.documents.push(document);
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64>
where
D: serde::Serialize,
{
@ -45,6 +61,7 @@ impl<D> DocumentsAddition<D> {
self.updates_store,
self.updates_results_store,
self.documents,
self.is_partial,
)?;
Ok(update_id)
}
@ -57,10 +74,11 @@ impl<D> Extend<D> for DocumentsAddition<D> {
}
pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
is_partial: bool,
) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len());
for add in addition {
@ -71,26 +89,27 @@ pub fn push_documents_addition<D: serde::Serialize>(
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsAddition(values);
let update = if is_partial {
Update::DocumentsPartial(values)
} else {
Update::DocumentsAddition(values)
};
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_documents_addition(
writer: &mut zlmdb::RwTxn,
pub fn apply_documents_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b>,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
addition: Vec<serde_json::Value>,
addition: Vec<HashMap<String, serde_json::Value>>,
) -> MResult<()> {
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new();
let mut indexer = RawIndexer::new();
let mut documents_additions = HashMap::new();
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
@ -99,20 +118,48 @@ pub fn apply_documents_addition(
let identifier = schema.identifier_name();
// 1. store documents ids for future deletion
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
documents_additions.insert(document_id, document);
}
// 2. index the document fields in ram stores
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
documents_ids,
)?;
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &schema,
document_store: &mut document_store,
document_fields_counts: &mut document_fields_counts,
document_store: documents_fields_store,
document_fields_counts: documents_fields_counts_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
@ -121,8 +168,65 @@ pub fn apply_documents_addition(
document.serialize(serializer)?;
}
// 1. remove the previous documents match indexes
let documents_to_insert = document_ids.iter().cloned().collect();
write_documents_addition_index(
writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)
}
pub fn apply_documents_partial_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b>,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
addition: Vec<HashMap<String, serde_json::Value>>,
) -> MResult<()> {
let mut documents_additions = HashMap::new();
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let identifier = schema.identifier_name();
// 1. store documents ids for future deletion
for mut document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
let mut deserializer = Deserializer {
document_id,
reader: writer,
documents_fields: documents_fields_store,
schema: &schema,
attributes: None,
};
// retrieve the old document and
// update the new one with missing keys found in the old one
let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
if let Some(old_document) = result {
for (key, value) in old_document {
document.entry(key).or_insert(value);
}
}
documents_additions.insert(document_id, document);
}
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(
writer,
main_store,
@ -130,20 +234,133 @@ pub fn apply_documents_addition(
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
ranked_map.clone(),
documents_to_insert,
documents_ids,
)?;
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents_fields_store.put_document_field(writer, id, attr, &value)?;
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &schema,
document_store: documents_fields_store,
document_fields_counts: documents_fields_counts_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
// 3. insert new document attributes counts
for ((id, attr), count) in document_fields_counts {
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
write_documents_addition_index(
writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)
}
pub fn reindex_all_documents(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = RankedMap::default();
// 1. retrieve all documents ids
let mut documents_ids_to_reindex = Vec::new();
for result in documents_fields_counts_store.documents_ids(writer)? {
let document_id = result?;
documents_ids_to_reindex.push(document_id);
}
// 2. remove the documents posting lists
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |_| 0)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
for documents_ids in documents_ids_to_reindex.chunks(100) {
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
let number_of_inserted_documents = documents_ids.len();
let mut indexer = RawIndexer::new(stop_words);
let mut ram_store = HashMap::new();
for document_id in documents_ids {
for result in documents_fields_store.document_fields(writer, *document_id)? {
let (attr, bytes) = result?;
let value: serde_json::Value = serde_json::from_slice(bytes)?;
ram_store.insert((document_id, attr), value);
}
for ((docid, attr), value) in ram_store.drain() {
serialize_value(
writer,
attr,
schema.props(attr),
*docid,
documents_fields_store,
documents_fields_counts_store,
&mut indexer,
&mut ranked_map,
&value,
)?;
}
}
// 4. write the new index in the main store
write_documents_addition_index(
writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
}
Ok(())
}
pub fn write_documents_addition_index(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer,
) -> MResult<()> {
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
@ -185,10 +402,8 @@ pub fn apply_documents_addition(
};
main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
main_store.put_ranked_map(writer, ranked_map)?;
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
Ok(())
}
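The heart of the partial addition is the merge step above: the stored document is deserialized and every key missing from the incoming payload is copied over, so a partial update only needs to carry the fields that change. The merge in isolation:

use std::collections::HashMap;

fn merge_partial(
    mut new_doc: HashMap<String, serde_json::Value>,
    old_doc: HashMap<String, serde_json::Value>,
) -> HashMap<String, serde_json::Value> {
    for (key, value) in old_doc {
        // keys present in the partial payload win over stored values
        new_doc.entry(key).or_insert(value);
    }
    new_doc
}

The merged document is then re-indexed exactly like a full addition: its old postings are deleted first, then it is serialized into the stores with a fresh RawIndexer.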

View File

@ -49,7 +49,7 @@ impl DocumentsDeletion {
Ok(())
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_documents_deletion(
writer,
@ -68,7 +68,7 @@ impl Extend<DocumentId> for DocumentsDeletion {
}
pub fn push_documents_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: Vec<DocumentId>,
@ -82,13 +82,12 @@ pub fn push_documents_deletion(
}
pub fn apply_documents_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> MResult<()> {
let idset = SetBuf::from_dirty(deletion);
@ -98,6 +97,11 @@ pub fn apply_documents_deletion(
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema
.iter()
@ -181,7 +185,6 @@ pub fn apply_documents_deletion(
main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
Ok(())
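Note that apply_documents_deletion no longer receives the RankedMap from its caller; it reloads the map from the main store itself, which is what lets update_task below drop its own lookups. A hedged usage sketch of the deletion front end:

let mut deletion = index.documents_deletion();
deletion.extend(vec![DocumentId(7), DocumentId(42)]);
let update_id = deletion.finalize(&mut writer)?;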

View File

@ -1,70 +1,125 @@
mod clear_all;
mod customs_update;
mod documents_addition;
mod documents_deletion;
mod schema_update;
mod stop_words_addition;
mod stop_words_deletion;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::clear_all::{apply_clear_all, push_clear_all};
pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_addition::{
apply_documents_addition, apply_documents_partial_addition, DocumentsAddition,
};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::time::{Duration, Instant};
use heed::Result as ZResult;
use log::debug;
use serde::{Deserialize, Serialize};
use zlmdb::Result as ZResult;
use crate::{store, DocumentId, MResult, RankedMap};
use crate::{store, DocumentId, MResult};
use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update {
ClearAll,
Schema(Schema),
Customs(Vec<u8>),
DocumentsAddition(Vec<serde_json::Value>),
DocumentsAddition(Vec<HashMap<String, serde_json::Value>>),
DocumentsPartial(Vec<HashMap<String, serde_json::Value>>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
StopWordsAddition(BTreeSet<String>),
StopWordsDeletion(BTreeSet<String>),
}
impl Update {
pub fn update_type(&self) -> UpdateType {
match self {
Update::ClearAll => UpdateType::ClearAll,
Update::Schema(schema) => UpdateType::Schema {
schema: schema.clone(),
},
Update::Customs(_) => UpdateType::Customs,
Update::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
number: addition.len(),
},
Update::DocumentsPartial(addition) => UpdateType::DocumentsPartial {
number: addition.len(),
},
Update::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
number: deletion.len(),
},
Update::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
number: addition.len(),
},
Update::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
number: deletion.len(),
},
Update::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
number: addition.len(),
},
Update::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
number: deletion.len(),
},
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateType {
ClearAll,
Schema { schema: Schema },
Customs,
DocumentsAddition { number: usize },
DocumentsPartial { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
StopWordsAddition { number: usize },
StopWordsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
pub main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateResult {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedUpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
#[derive(Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnqueuedUpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateStatus {
Enqueued,
Processed(UpdateResult),
Enqueued(EnqueuedUpdateResult),
Processed(ProcessedUpdateResult),
Unknown,
}
pub fn update_status(
reader: &zlmdb::RoTxn,
reader: &heed::RoTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
@ -72,8 +127,11 @@ pub fn update_status(
match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)),
None => {
if updates_store.contains(reader, update_id)? {
Ok(UpdateStatus::Enqueued)
if let Some(update) = updates_store.get(reader, update_id)? {
Ok(UpdateStatus::Enqueued(EnqueuedUpdateResult {
update_id,
update_type: update.update_type(),
}))
} else {
Ok(UpdateStatus::Unknown)
}
@ -82,7 +140,7 @@ pub fn update_status(
}
pub fn next_update_id(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> ZResult<u64> {
@ -98,25 +156,45 @@ pub fn next_update_id(
Ok(new_update_id)
}
pub fn update_task(
writer: &mut zlmdb::RwTxn,
pub fn update_task<'a, 'b>(
writer: &'a mut heed::RwTxn<'b>,
index: store::Index,
) -> MResult<Option<UpdateResult>> {
let (update_id, update) = match index.updates.pop_front(writer)? {
Some(value) => value,
None => return Ok(None),
};
update_id: u64,
update: Update,
) -> MResult<ProcessedUpdateResult> {
debug!("Processing update number {}", update_id);
let (update_type, result, duration) = match update {
Update::ClearAll => {
let start = Instant::now();
let update_type = UpdateType::ClearAll;
let result = apply_clear_all(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed())
}
Update::Schema(schema) => {
let start = Instant::now();
let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let result = apply_schema_update(writer, index.main, &schema);
let result = apply_schema_update(
writer,
&schema,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed())
}
@ -131,11 +209,6 @@ pub fn update_task(
Update::DocumentsAddition(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
@ -147,7 +220,25 @@ pub fn update_task(
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
(update_type, result, start.elapsed())
}
Update::DocumentsPartial(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsPartial {
number: documents.len(),
};
let result = apply_documents_partial_addition(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
documents,
);
@ -156,11 +247,6 @@ pub fn update_task(
Update::DocumentsDeletion(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
@ -172,7 +258,6 @@ pub fn update_task(
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
@ -198,6 +283,37 @@ pub fn update_task(
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
Update::StopWordsAddition(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsAddition {
number: stop_words.len(),
};
let result =
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
(update_type, result, start.elapsed())
}
Update::StopWordsDeletion(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsDeletion {
number: stop_words.len(),
};
let result = apply_stop_words_deletion(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
stop_words,
);
(update_type, result, start.elapsed())
}
};
@ -208,16 +324,12 @@ pub fn update_task(
);
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult {
let status = ProcessedUpdateResult {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
};
index
.updates_results
.put_update_result(writer, update_id, &status)?;
Ok(Some(status))
Ok(status)
}
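A minimal polling sketch for the API above, assuming an `index: store::Index` carrying the same `updates`/`updates_results` stores and an open heed environment (all names are taken from the surrounding code):

let reader = env.read_txn()?;
match update_status(&reader, index.updates, index.updates_results, update_id)? {
    // the worker already ran update_task and stored the outcome
    UpdateStatus::Processed(result) => println!("processed: {:?}", result.result),
    // still sitting in the updates store, waiting for the worker
    UpdateStatus::Enqueued(meta) => println!("enqueued as {:?}", meta.update_type),
    UpdateStatus::Unknown => println!("no update registered under this id"),
}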

View File

@ -1,23 +1,67 @@
use meilidb_schema::{Diff, Schema};
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema;
pub fn apply_schema_update(
writer: &mut zlmdb::RwTxn,
main_store: store::Main,
writer: &mut heed::RwTxn,
new_schema: &Schema,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
if main_store.schema(writer)?.is_some() {
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
use UnsupportedOperation::{
CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
CannotReorderSchemaAttribute, CannotUpdateSchemaIdentifier,
};
let mut need_full_reindexing = false;
if let Some(old_schema) = main_store.schema(writer)? {
for diff in meilidb_schema::diff(&old_schema, new_schema) {
match diff {
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
Diff::AttrMove { .. } => return Err(CannotReorderSchemaAttribute.into()),
Diff::AttrPropsChange { old, new, .. } => {
if new.indexed != old.indexed {
need_full_reindexing = true;
}
if new.ranked != old.ranked {
need_full_reindexing = true;
}
}
Diff::NewAttr { pos, .. } => {
// new attribute not at the end of the schema
if pos < old_schema.number_of_attributes() {
return Err(CanOnlyIntroduceNewSchemaAttributesAtEnd.into());
}
}
Diff::RemovedAttr { .. } => return Err(CannotRemoveSchemaAttribute.into()),
}
}
}
main_store
.put_schema(writer, new_schema)
.map_err(Into::into)
main_store.put_schema(writer, new_schema)?;
if need_full_reindexing {
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?
}
Ok(())
}
pub fn push_schema_update(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
schema: Schema,
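Under the rules above, the only accepted structural change is appending attributes at the end of an existing schema; renaming the identifier, reordering, or removing attributes all fail, while flipping `indexed` or `ranked` on an existing attribute succeeds at the cost of a full reindexing. A sketch of a legal extension, reusing the SchemaBuilder API that appears later in this changeset (attribute names are hypothetical):

use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};

// The previous schema held only `id` and `title`; appending
// `release_date` at the end is accepted, and since no indexed/ranked
// flag changed on an existing attribute, no reindexing is triggered.
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", DISPLAYED | INDEXED);
builder.new_attribute("release_date", DISPLAYED);
let new_schema = builder.build();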

View File

@ -0,0 +1,116 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
stop_words: BTreeSet<String>,
}
impl StopWordsAddition {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> StopWordsAddition {
StopWordsAddition {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_stop_words_addition(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_addition(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsAddition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
addition: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in addition {
stop_words_builder.insert(&word).unwrap();
// we remove every posting list associated with a newly added stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// we also need to remove the new stop words from the main words fst
if let Some(word_fst) = main_store.words_fst(writer)? {
let op = OpBuilder::new()
.add(&word_fst)
.add(&delta_stop_words)
.difference();
let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op).unwrap();
let word_fst = word_fst_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_words_fst(writer, &word_fst)?;
}
// now we add all of these stop words to the stop words fst of the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.r#union();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
Ok(())
}
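A minimal usage sketch for the builder above, assuming the two stores, the notifier channel, and a heed environment are already in scope, as they are wherever an index is open:

let mut addition = StopWordsAddition::new(updates_store, updates_results_store, updates_notifier);
addition.add_stop_word("the");
addition.add_stop_word("of");
let mut writer = env.write_txn()?;
let update_id = addition.finalize(&mut writer)?;
writer.commit()?;
// the update worker will eventually call apply_stop_words_addition for this id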

View File

@ -0,0 +1,112 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
stop_words: BTreeSet<String>,
}
impl StopWordsDeletion {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> StopWordsDeletion {
StopWordsDeletion {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_stop_words_deletion(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_deletion(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsDeletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_deletion(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
deletion: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in deletion {
stop_words_builder.insert(&word).unwrap();
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// now we delete all of these stop words from the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.difference();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
// now that the stop words have been updated,
// let's reindex all of the documents...
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?;
Ok(())
}
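Both stop-word files lean on the same fst pattern: build a delta set in memory, combine it with an existing set through an OpBuilder, and rebuild the result. A standalone sketch of the subtraction case, assuming the fst 0.3-style API used above (SetBuilder::memory, Set::from_bytes):

use fst::{set::OpBuilder, Set, SetBuilder};

// Returns `words` minus `to_remove`: the operation used above to strip
// stop words out of the main words fst.
fn set_difference(words: &Set, to_remove: &Set) -> Result<Set, fst::Error> {
    let op = OpBuilder::new().add(words).add(to_remove).difference();
    let mut builder = SetBuilder::memory();
    builder.extend_stream(op)?;
    builder.into_inner().and_then(Set::from_bytes)
}

Swapping difference() for r#union() gives the addition case.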

View File

@ -42,7 +42,7 @@ impl SynonymsAddition {
.extend(alternatives);
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_addition(
writer,
@ -55,7 +55,7 @@ impl SynonymsAddition {
}
pub fn push_synonyms_addition(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>,
@ -69,7 +69,7 @@ pub fn push_synonyms_addition(
}
pub fn apply_synonyms_addition(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>,

View File

@ -49,7 +49,7 @@ impl SynonymsDeletion {
}
}
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_deletion(
writer,
@ -62,7 +62,7 @@ impl SynonymsDeletion {
}
pub fn push_synonyms_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>,
@ -76,7 +76,7 @@ pub fn push_synonyms_deletion(
}
pub fn apply_synonyms_deletion(
writer: &mut zlmdb::RwTxn,
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>,

meilidb-http/Cargo.toml Normal file (+55)
View File

@ -0,0 +1,55 @@
[package]
name = "meilidb-http"
version = "0.1.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",
]
edition = "2018"
[dependencies]
bincode = "1.2.0"
chrono = { version = "0.4.9", features = ["serde"] }
crossbeam-channel = "0.3.9"
envconfig = "0.5.1"
envconfig_derive = "0.5.1"
heed = "0.5.0"
http = "0.1.19"
indexmap = { version = "1.3.0", features = ["serde-1"] }
jemallocator = "0.3.2"
log = "0.4.8"
main_error = "0.1.0"
meilidb-core = { path = "../meilidb-core", version = "0.6.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
pretty-bytes = "0.2.2"
rand = "0.7.2"
rayon = "1.2.0"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = { version = "1.0.41", features = ["preserve_order"] }
structopt = "0.3.3"
sysinfo = "0.9.5"
walkdir = "2.2.9"
[dependencies.async-compression]
default-features = false
features = ["stream", "gzip", "zlib", "brotli", "zstd"]
version = "0.1.0-alpha.7"
[dependencies.tide]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-log]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-slog]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-compression]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[build-dependencies]
vergen = "3.0.4"

meilidb-http/build.rs Normal file (+10)
View File

@ -0,0 +1,10 @@
use vergen::{generate_cargo_keys, ConstantsFlags};
fn main() {
// Set up the flags, toggling off the 'SEMVER_FROM_CARGO_PKG' flag
let mut flags = ConstantsFlags::all();
flags.toggle(ConstantsFlags::SEMVER_FROM_CARGO_PKG);
// Generate the 'cargo:' key output from the toggled flags
generate_cargo_keys(flags).expect("Unable to generate the cargo keys!");
}

meilidb-http/src/data.rs Normal file (+190)
View File

@ -0,0 +1,190 @@
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use chrono::{DateTime, Utc};
use heed::types::{SerdeBincode, Str};
use log::*;
use meilidb_core::{Database, MResult};
use sysinfo::Pid;
use crate::option::Opt;
use crate::routes::index::index_update_callback;
pub type FreqsMap = HashMap<String, usize>;
type SerdeFreqsMap = SerdeBincode<FreqsMap>;
type SerdeDatetime = SerdeBincode<DateTime<Utc>>;
#[derive(Clone)]
pub struct Data {
inner: Arc<DataInner>,
}
impl Deref for Data {
type Target = DataInner;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
#[derive(Clone)]
pub struct DataInner {
pub db: Arc<Database>,
pub db_path: String,
pub admin_token: Option<String>,
pub server_pid: Pid,
pub accept_updates: Arc<AtomicBool>,
}
impl DataInner {
pub fn is_indexing(&self, reader: &heed::RoTxn, index: &str) -> MResult<Option<bool>> {
match self.db.open_index(&index) {
Some(index) => index.current_update_id(&reader).map(|u| Some(u.is_some())),
None => Ok(None),
}
}
pub fn last_update(
&self,
reader: &heed::RoTxn,
index_name: &str,
) -> MResult<Option<DateTime<Utc>>> {
let key = format!("last-update-{}", index_name);
match self
.db
.common_store()
.get::<Str, SerdeDatetime>(&reader, &key)?
{
Some(datetime) => Ok(Some(datetime)),
None => Ok(None),
}
}
pub fn set_last_update(&self, writer: &mut heed::RwTxn, index_name: &str) -> MResult<()> {
let key = format!("last-update-{}", index_name);
self.db
.common_store()
.put::<Str, SerdeDatetime>(writer, &key, &Utc::now())
.map_err(Into::into)
}
pub fn last_backup(&self, reader: &heed::RoTxn) -> MResult<Option<DateTime<Utc>>> {
match self
.db
.common_store()
.get::<Str, SerdeDatetime>(&reader, "last-backup")?
{
Some(datetime) => Ok(Some(datetime)),
None => Ok(None),
}
}
pub fn set_last_backup(&self, writer: &mut heed::RwTxn) -> MResult<()> {
self.db
.common_store()
.put::<Str, SerdeDatetime>(writer, "last-backup", &Utc::now())?;
Ok(())
}
pub fn fields_frequency(
&self,
reader: &heed::RoTxn,
index_name: &str,
) -> MResult<Option<FreqsMap>> {
let key = format!("fields-frequency-{}", index_name);
match self
.db
.common_store()
.get::<Str, SerdeFreqsMap>(&reader, &key)?
{
Some(freqs) => Ok(Some(freqs)),
None => Ok(None),
}
}
pub fn compute_stats(&self, writer: &mut heed::RwTxn, index_name: &str) -> MResult<()> {
let index = match self.db.open_index(&index_name) {
Some(index) => index,
None => {
error!("Impossible to retrieve index {}", index_name);
return Ok(());
}
};
let schema = match index.main.schema(&writer)? {
Some(schema) => schema,
None => return Ok(()),
};
let all_documents_fields = index
.documents_fields_counts
.all_documents_fields_counts(&writer)?;
// count field frequencies
let mut fields_frequency = HashMap::<_, usize>::new();
for result in all_documents_fields {
let (_, attr, _) = result?;
*fields_frequency.entry(attr).or_default() += 1;
}
// convert attributes to their names
let frequency: HashMap<_, _> = fields_frequency
.into_iter()
.map(|(a, c)| (schema.attribute_name(a).to_owned(), c))
.collect();
let key = format!("fields-frequency-{}", index_name);
self.db
.common_store()
.put::<Str, SerdeFreqsMap>(writer, &key, &frequency)?;
Ok(())
}
pub fn stop_accept_updates(&self) {
self.accept_updates.store(false, Ordering::Relaxed);
}
pub fn accept_updates(&self) -> bool {
self.accept_updates.load(Ordering::Relaxed)
}
}
impl Data {
pub fn new(opt: Opt) -> Data {
let db_path = opt.database_path.clone();
let admin_token = opt.admin_token.clone();
let server_pid = sysinfo::get_current_pid().unwrap();
let db = Arc::new(Database::open_or_create(opt.database_path.clone()).unwrap());
let accept_updates = Arc::new(AtomicBool::new(true));
let inner_data = DataInner {
db: db.clone(),
db_path,
admin_token,
server_pid,
accept_updates,
};
let data = Data {
inner: Arc::new(inner_data),
};
for index_name in db.indexes_names().unwrap() {
let callback_context = data.clone();
let callback_name = index_name.clone();
db.set_update_callback(
index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
}
data
}
}
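A read-side sketch of the helpers above, assuming a `data: Data` built by Data::new (the "movies" index name is hypothetical):

let reader = data.db.env.read_txn()?;
match data.is_indexing(&reader, "movies")? {
    Some(true) => println!("movies is currently processing an update"),
    Some(false) => println!("movies is idle"),
    None => println!("no index named movies"),
}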

meilidb-http/src/error.rs Normal file (+117)
View File

@ -0,0 +1,117 @@
use std::fmt::Display;
use http::status::StatusCode;
use log::{error, warn};
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::Response;
pub type SResult<T> = Result<T, ResponseError>;
pub enum ResponseError {
Internal(String),
BadRequest(String),
InvalidToken(String),
NotFound(String),
IndexNotFound(String),
DocumentNotFound(String),
MissingHeader(String),
BadParameter(String, String),
CreateIndex(String),
Maintenance,
}
impl ResponseError {
pub fn internal(message: impl Display) -> ResponseError {
ResponseError::Internal(message.to_string())
}
pub fn bad_request(message: impl Display) -> ResponseError {
ResponseError::BadRequest(message.to_string())
}
pub fn invalid_token(message: impl Display) -> ResponseError {
ResponseError::InvalidToken(message.to_string())
}
pub fn not_found(message: impl Display) -> ResponseError {
ResponseError::NotFound(message.to_string())
}
pub fn index_not_found(message: impl Display) -> ResponseError {
ResponseError::IndexNotFound(message.to_string())
}
pub fn document_not_found(message: impl Display) -> ResponseError {
ResponseError::DocumentNotFound(message.to_string())
}
pub fn missing_header(message: impl Display) -> ResponseError {
ResponseError::MissingHeader(message.to_string())
}
pub fn bad_parameter(name: impl Display, message: impl Display) -> ResponseError {
ResponseError::BadParameter(name.to_string(), message.to_string())
}
pub fn create_index(message: impl Display) -> ResponseError {
ResponseError::CreateIndex(message.to_string())
}
}
impl IntoResponse for ResponseError {
fn into_response(self) -> Response {
match self {
ResponseError::Internal(err) => {
error!("internal server error: {}", err);
error(
String::from("Internal server error"),
StatusCode::INTERNAL_SERVER_ERROR,
)
}
ResponseError::BadRequest(err) => {
warn!("bad request: {}", err);
error(err, StatusCode::BAD_REQUEST)
}
ResponseError::InvalidToken(err) => {
error(format!("Invalid Token: {}", err), StatusCode::FORBIDDEN)
}
ResponseError::NotFound(err) => error(err, StatusCode::NOT_FOUND),
ResponseError::IndexNotFound(index) => {
error(format!("Index {} not found", index), StatusCode::NOT_FOUND)
}
ResponseError::DocumentNotFound(id) => error(
format!("Document with id {} not found", id),
StatusCode::NOT_FOUND,
),
ResponseError::MissingHeader(header) => error(
format!("Header {} is missing", header),
StatusCode::UNAUTHORIZED,
),
ResponseError::BadParameter(param, e) => error(
format!("Url parameter {} error: {}", param, e),
StatusCode::BAD_REQUEST,
),
ResponseError::CreateIndex(err) => error(
format!("Impossible to create index; {}", err),
StatusCode::BAD_REQUEST,
),
ResponseError::Maintenance => error(
String::from("Server is in maintenance, please try again later"),
StatusCode::SERVICE_UNAVAILABLE,
),
}
}
}
#[derive(Serialize, Deserialize)]
struct ErrorMessage {
message: String,
}
fn error(message: String, status: StatusCode) -> Response {
let message = ErrorMessage { message };
tide::response::json(message)
.with_status(status)
.into_response()
}
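A hypothetical handler fragment showing the intended flow: build the variant with a helper, bubble it up through SResult, and let IntoResponse turn it into a JSON body with the matching status code:

fn check_limit(limit: usize) -> SResult<usize> {
    if limit > 100 {
        // rendered as a 400 with body {"message": "Url parameter limit error: ..."}
        return Err(ResponseError::bad_parameter("limit", "must be 100 or less"));
    }
    Ok(limit)
}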

View File

@ -0,0 +1,568 @@
use crate::routes::setting::{RankingOrdering, SettingBody};
use indexmap::IndexMap;
use log::*;
use meilidb_core::criterion::*;
use meilidb_core::Highlight;
use meilidb_core::{Index, RankedMap};
use meilidb_schema::{Schema, SchemaAttr};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::convert::From;
use std::error;
use std::fmt;
use std::time::{Duration, Instant};
#[derive(Debug)]
pub enum Error {
SearchDocuments(String),
RetrieveDocument(u64, String),
DocumentNotFound(u64),
CropFieldWrongType(String),
AttributeNotFoundOnDocument(String),
AttributeNotFoundOnSchema(String),
MissingFilterValue,
UnknownFilteredAttribute,
Internal(String),
}
impl error::Error for Error {}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use Error::*;
match self {
SearchDocuments(err) => write!(f, "impossible to search documents; {}", err),
RetrieveDocument(id, err) => write!(
f,
"impossible to retrieve the document with id: {}; {}",
id, err
),
DocumentNotFound(id) => write!(f, "document {} not found", id),
CropFieldWrongType(field) => {
write!(f, "the field {} cannot be cropped it's not a string", field)
}
AttributeNotFoundOnDocument(field) => {
write!(f, "field {} is not found on document", field)
}
AttributeNotFoundOnSchema(field) => write!(f, "field {} is not found on schema", field),
MissingFilterValue => f.write_str("a filter doesn't have a value to compare it with"),
UnknownFilteredAttribute => {
f.write_str("a filter is specifying an unknown schema attribute")
}
Internal(err) => write!(f, "internal error; {}", err),
}
}
}
impl From<meilidb_core::Error> for Error {
fn from(error: meilidb_core::Error) -> Self {
Error::Internal(error.to_string())
}
}
pub trait IndexSearchExt {
fn new_search(&self, query: String) -> SearchBuilder;
}
impl IndexSearchExt for Index {
fn new_search(&self, query: String) -> SearchBuilder {
SearchBuilder {
index: self,
query,
offset: 0,
limit: 20,
attributes_to_crop: None,
attributes_to_retrieve: None,
attributes_to_search_in: None,
attributes_to_highlight: None,
filters: None,
timeout: Duration::from_millis(30),
matches: false,
}
}
}
pub struct SearchBuilder<'a> {
index: &'a Index,
query: String,
offset: usize,
limit: usize,
attributes_to_crop: Option<HashMap<String, usize>>,
attributes_to_retrieve: Option<HashSet<String>>,
attributes_to_search_in: Option<HashSet<String>>,
attributes_to_highlight: Option<HashSet<String>>,
filters: Option<String>,
timeout: Duration,
matches: bool,
}
impl<'a> SearchBuilder<'a> {
pub fn offset(&mut self, value: usize) -> &SearchBuilder {
self.offset = value;
self
}
pub fn limit(&mut self, value: usize) -> &SearchBuilder {
self.limit = value;
self
}
pub fn attributes_to_crop(&mut self, value: HashMap<String, usize>) -> &SearchBuilder {
self.attributes_to_crop = Some(value);
self
}
pub fn attributes_to_retrieve(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_retrieve = Some(value);
self
}
pub fn add_retrievable_field(&mut self, value: String) -> &SearchBuilder {
let attributes_to_retrieve = self.attributes_to_retrieve.get_or_insert(HashSet::new());
attributes_to_retrieve.insert(value);
self
}
pub fn attributes_to_search_in(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_search_in = Some(value);
self
}
pub fn attributes_to_highlight(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_highlight = Some(value);
self
}
pub fn filters(&mut self, value: String) -> &SearchBuilder {
self.filters = Some(value);
self
}
pub fn timeout(&mut self, value: Duration) -> &SearchBuilder {
self.timeout = value;
self
}
pub fn get_matches(&mut self) -> &SearchBuilder {
self.matches = true;
self
}
pub fn search(&self, reader: &heed::RoTxn) -> Result<SearchResult, Error> {
let schema = self.index.main.schema(reader);
let schema = schema.map_err(|e| Error::Internal(e.to_string()))?;
let schema = match schema {
Some(schema) => schema,
None => return Err(Error::Internal(String::from("missing schema"))),
};
let ranked_map = self.index.main.ranked_map(reader);
let ranked_map = ranked_map.map_err(|e| Error::Internal(e.to_string()))?;
let ranked_map = ranked_map.unwrap_or_default();
let start = Instant::now();
// Change criteria
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
Some(criteria) => self.index.query_builder_with_criteria(criteria),
None => self.index.query_builder(),
};
// Filter searchable fields
if let Some(fields) = &self.attributes_to_search_in {
for attribute in fields.iter().filter_map(|f| schema.attribute(f)) {
query_builder.add_searchable_attribute(attribute.0);
}
}
if let Some(filters) = &self.filters {
let mut split = filters.split(':');
match (split.next(), split.next()) {
(Some(_), None) | (Some(_), Some("")) => return Err(Error::MissingFilterValue),
(Some(attr), Some(value)) => {
let ref_reader = reader;
let ref_index = &self.index;
let value = value.trim().to_lowercase();
let attr = match schema.attribute(attr) {
Some(attr) => attr,
None => return Err(Error::UnknownFilteredAttribute),
};
query_builder.with_filter(move |id| {
let attr = attr;
let index = ref_index;
let reader = ref_reader;
match index.document_attribute::<Value>(reader, id, attr) {
Ok(Some(Value::String(s))) => s.to_lowercase() == value,
Ok(Some(Value::Bool(b))) => {
(value == "true" && b) || (value == "false" && !b)
}
Ok(Some(Value::Array(a))) => {
a.into_iter().any(|s| s.as_str() == Some(&value))
}
_ => false,
}
});
}
(_, _) => (),
}
}
query_builder.with_fetch_timeout(self.timeout);
let docs =
query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
let mut hits = Vec::with_capacity(self.limit);
for doc in docs.map_err(|e| Error::SearchDocuments(e.to_string()))? {
// retrieve the content of the document from the kv store
let mut fields: Option<HashSet<&str>> = None;
if let Some(attributes_to_retrieve) = &self.attributes_to_retrieve {
let mut set = HashSet::new();
for field in attributes_to_retrieve {
set.insert(field.as_str());
}
fields = Some(set);
}
let mut document: IndexMap<String, Value> = self
.index
.document(reader, fields.as_ref(), doc.id)
.map_err(|e| Error::RetrieveDocument(doc.id.0, e.to_string()))?
.ok_or(Error::DocumentNotFound(doc.id.0))?;
let mut matches = doc.highlights.clone();
// Crop fields if needed
if let Some(fields) = self.attributes_to_crop.clone() {
for (field, length) in fields {
let _ = crop_document(&mut document, &mut matches, &schema, &field, length);
}
}
// Transform to readable matches
let matches = calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema);
if !self.matches {
if let Some(attributes_to_highlight) = self.attributes_to_highlight.clone() {
let highlights = calculate_highlights(
document.clone(),
matches.clone(),
attributes_to_highlight,
);
for (key, value) in highlights {
if let Some(content) = document.get_mut(&key) {
*content = value;
}
}
}
}
let matches_info = if self.matches { Some(matches) } else { None };
let hit = SearchHit {
hit: document,
matches_info,
};
hits.push(hit);
}
let time_ms = start.elapsed().as_millis() as usize;
let results = SearchResult {
hits,
offset: self.offset,
limit: self.limit,
processing_time_ms: time_ms,
query: self.query.to_string(),
};
Ok(results)
}
pub fn get_criteria(
&self,
reader: &heed::RoTxn,
ranked_map: &'a RankedMap,
schema: &Schema,
) -> Result<Option<Criteria<'a>>, Error> {
let current_settings = match self.index.main.customs(reader).unwrap() {
Some(bytes) => bincode::deserialize(bytes).unwrap(),
None => SettingBody::default(),
};
let ranking_rules = &current_settings.ranking_rules;
let ranking_order = &current_settings.ranking_order;
if let Some(ranking_rules) = ranking_rules {
let mut builder = CriteriaBuilder::with_capacity(7 + ranking_rules.len());
if let Some(ranking_rules_order) = ranking_order {
for rule in ranking_rules_order {
match rule.as_str() {
"_sum_of_typos" => builder.push(SumOfTypos),
"_number_of_words" => builder.push(NumberOfWords),
"_word_proximity" => builder.push(WordsProximity),
"_sum_of_words_attribute" => builder.push(SumOfWordsAttribute),
"_sum_of_words_position" => builder.push(SumOfWordsPosition),
"_exact" => builder.push(Exact),
_ => {
let order = match ranking_rules.get(rule.as_str()) {
Some(o) => o,
None => continue,
};
let custom_ranking = match order {
RankingOrdering::Asc => {
SortByAttr::lower_is_better(&ranked_map, &schema, &rule)
.unwrap()
}
RankingOrdering::Dsc => {
SortByAttr::higher_is_better(&ranked_map, &schema, &rule)
.unwrap()
}
};
builder.push(custom_ranking);
}
}
}
builder.push(DocumentId);
return Ok(Some(builder.build()));
} else {
builder.push(SumOfTypos);
builder.push(NumberOfWords);
builder.push(WordsProximity);
builder.push(SumOfWordsAttribute);
builder.push(SumOfWordsPosition);
builder.push(Exact);
for (rule, order) in ranking_rules.iter() {
let custom_ranking = match order {
RankingOrdering::Asc => {
SortByAttr::lower_is_better(&ranked_map, &schema, &rule).unwrap()
}
RankingOrdering::Dsc => {
SortByAttr::higher_is_better(&ranked_map, &schema, &rule).unwrap()
}
};
builder.push(custom_ranking);
}
builder.push(DocumentId);
return Ok(Some(builder.build()));
}
}
Ok(None)
}
}
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Serialize, Deserialize)]
pub struct MatchPosition {
pub start: usize,
pub length: usize,
}
impl Ord for MatchPosition {
fn cmp(&self, other: &Self) -> Ordering {
match self.start.cmp(&other.start) {
Ordering::Equal => self.length.cmp(&other.length),
_ => self.start.cmp(&other.start),
}
}
}
pub type HighlightInfos = HashMap<String, Value>;
pub type MatchesInfos = HashMap<String, Vec<MatchPosition>>;
// pub type RankingInfos = HashMap<String, u64>;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchHit {
#[serde(flatten)]
pub hit: IndexMap<String, Value>,
#[serde(rename = "_matchesInfo", skip_serializing_if = "Option::is_none")]
pub matches_info: Option<MatchesInfos>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
pub offset: usize,
pub limit: usize,
pub processing_time_ms: usize,
pub query: String,
// pub parsed_query: String,
// pub params: Option<String>,
}
fn crop_text(
text: &str,
matches: impl IntoIterator<Item = Highlight>,
context: usize,
) -> (String, Vec<Highlight>) {
let mut matches = matches.into_iter().peekable();
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let matches = matches
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
.map(|match_| Highlight {
char_index: match_.char_index - start as u16,
..match_
})
.collect();
(text, matches)
}
fn crop_document(
document: &mut IndexMap<String, Value>,
matches: &mut Vec<Highlight>,
schema: &Schema,
field: &str,
length: usize,
) -> Result<(), Error> {
matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let attribute = schema
.attribute(field)
.ok_or(Error::AttributeNotFoundOnSchema(field.to_string()))?;
let selected_matches = matches
.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attribute)
.cloned();
let original_text = match document.get(field) {
Some(Value::String(text)) => text,
Some(_) => return Err(Error::CropFieldWrongType(field.to_string())),
None => return Err(Error::AttributeNotFoundOnDocument(field.to_string())),
};
let (cropped_text, cropped_matches) = crop_text(&original_text, selected_matches, length);
document.insert(
field.to_string(),
serde_json::value::Value::String(cropped_text),
);
matches.retain(|m| SchemaAttr::new(m.attribute) != attribute);
matches.extend_from_slice(&cropped_matches);
Ok(())
}
fn calculate_matches(
matches: Vec<Highlight>,
attributes_to_retrieve: Option<HashSet<String>>,
schema: &Schema,
) -> MatchesInfos {
let mut matches_result: HashMap<String, Vec<MatchPosition>> = HashMap::new();
for m in matches.iter() {
let attribute = schema
.attribute_name(SchemaAttr::new(m.attribute))
.to_string();
if let Some(attributes_to_retrieve) = attributes_to_retrieve.clone() {
if !attributes_to_retrieve.contains(attribute.as_str()) {
continue;
}
};
if let Some(pos) = matches_result.get_mut(&attribute) {
pos.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
} else {
let mut positions = Vec::new();
positions.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
matches_result.insert(attribute, positions);
}
}
for (_, val) in matches_result.iter_mut() {
val.sort_unstable();
val.dedup();
}
matches_result
}
fn calculate_highlights(
document: IndexMap<String, Value>,
matches: MatchesInfos,
attributes_to_highlight: HashSet<String>,
) -> HighlightInfos {
let mut highlight_result: HashMap<String, Value> = HashMap::new();
for (attribute, matches) in matches.iter() {
if attributes_to_highlight.contains("*") || attributes_to_highlight.contains(attribute) {
if let Some(Value::String(value)) = document.get(attribute) {
let value: Vec<_> = value.chars().collect();
let mut highlighted_value = String::new();
let mut index = 0;
for m in matches {
if m.start >= index {
let before = value.get(index..m.start);
let highlighted = value.get(m.start..(m.start + m.length));
if let (Some(before), Some(highlighted)) = (before, highlighted) {
highlighted_value.extend(before);
highlighted_value.push_str("<em>");
highlighted_value.extend(highlighted);
highlighted_value.push_str("</em>");
index = m.start + m.length;
} else {
error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
}
}
}
highlighted_value.extend(value[index..].iter());
highlight_result.insert(attribute.to_string(), Value::String(highlighted_value));
};
}
}
highlight_result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn calculate_highlights() {
let data = r#"{
"title": "Fondation (Isaac ASIMOV)",
"description": "En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."
}"#;
let document: IndexMap<String, Value> = serde_json::from_str(data).unwrap();
let mut attributes_to_highlight = HashSet::new();
attributes_to_highlight.insert("*".to_string());
let mut matches: HashMap<String, Vec<MatchPosition>> = HashMap::new();
let mut m = Vec::new();
m.push(MatchPosition {
start: 0,
length: 9,
});
matches.insert("title".to_string(), m);
let mut m = Vec::new();
m.push(MatchPosition {
start: 510,
length: 9,
});
matches.insert("description".to_string(), m);
let result = super::calculate_highlights(document, matches, attributes_to_highlight);
let mut result_expected = HashMap::new();
result_expected.insert(
"title".to_string(),
Value::String("<em>Fondation</em> (Isaac ASIMOV)".to_string()),
);
result_expected.insert("description".to_string(), Value::String("En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la <em>Fondation</em>.".to_string()));
assert_eq!(result, result_expected);
}
}
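A minimal end-to-end sketch of the builder above, assuming an open `index: Index`, a heed environment, and IndexSearchExt in scope (the query and filter values are hypothetical). Note that the setters return `&SearchBuilder`, so they are called as separate statements rather than chained:

use std::time::Duration;

let reader = env.read_txn()?;
let mut builder = index.new_search("fondation".to_string());
builder.limit(10);
builder.filters("author:asimov".to_string());
builder.timeout(Duration::from_millis(50));
let results = builder.search(&reader)?;
println!("{} hits in {} ms", results.hits.len(), results.processing_time_ms);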

View File

@ -0,0 +1,2 @@
pub mod meilidb;
pub mod tide;

View File

@ -0,0 +1,118 @@
use crate::error::{ResponseError, SResult};
use crate::models::token::*;
use crate::Data;
use chrono::Utc;
use heed::types::{SerdeBincode, Str};
use meilidb_core::Index;
use serde_json::Value;
use tide::Context;
pub trait ContextExt {
fn is_allowed(&self, acl: ACL) -> SResult<()>;
fn header(&self, name: &str) -> Result<String, ResponseError>;
fn url_param(&self, name: &str) -> Result<String, ResponseError>;
fn index(&self) -> Result<Index, ResponseError>;
fn identifier(&self) -> Result<String, ResponseError>;
}
impl ContextExt for Context<Data> {
fn is_allowed(&self, acl: ACL) -> SResult<()> {
let admin_token = match &self.state().admin_token {
Some(admin_token) => admin_token,
None => return Ok(()),
};
let user_api_key = self.header("X-Meili-API-Key")?;
if user_api_key == *admin_token {
return Ok(());
}
let request_index: Option<String> = None; //self.param::<String>("index").ok();
let db = &self.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, user_api_key);
let token_config = db
.common_store()
.get::<Str, SerdeBincode<Token>>(&reader, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
if token_config.revoked {
return Err(ResponseError::invalid_token("token revoked"));
}
if let Some(index) = request_index {
if !token_config
.indexes
.iter()
.any(|r| match_wildcard(&r, &index))
{
return Err(ResponseError::invalid_token(
"token is not allowed to access to this index",
));
}
}
if token_config.expires_at < Utc::now() {
return Err(ResponseError::invalid_token("token expired"));
}
if token_config.acl.contains(&ACL::All) {
return Ok(());
}
if !token_config.acl.contains(&acl) {
return Err(ResponseError::invalid_token("token do not have this ACL"));
}
Ok(())
}
fn header(&self, name: &str) -> Result<String, ResponseError> {
let header = self
.headers()
.get(name)
.ok_or(ResponseError::missing_header(name))?
.to_str()
.map_err(|_| ResponseError::missing_header(name))?
.to_string();
Ok(header)
}
fn url_param(&self, name: &str) -> Result<String, ResponseError> {
let param = self
.param::<String>(name)
.map_err(|e| ResponseError::bad_parameter(name, e))?;
Ok(param)
}
fn index(&self) -> Result<Index, ResponseError> {
let index_name = self.url_param("index")?;
let index = self
.state()
.db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(index_name))?;
Ok(index)
}
fn identifier(&self) -> Result<String, ResponseError> {
let name = self
.param::<Value>("identifier")
.as_ref()
.map(meilidb_core::serde::value_to_string)
.map_err(|e| ResponseError::bad_parameter("identifier", e))?
.ok_or(ResponseError::bad_parameter(
"identifier",
"missing parameter",
))?;
Ok(name)
}
}

meilidb-http/src/lib.rs Normal file (+11)
View File

@ -0,0 +1,11 @@
#[macro_use]
extern crate envconfig_derive;
pub mod data;
pub mod error;
pub mod helpers;
pub mod models;
pub mod option;
pub mod routes;
use self::data::Data;

meilidb-http/src/main.rs Normal file (+36)
View File

@ -0,0 +1,36 @@
use http::header::HeaderValue;
use log::info;
use main_error::MainError;
use tide::middleware::{CorsMiddleware, CorsOrigin};
use tide_log::RequestLogger;
use meilidb_http::data::Data;
use meilidb_http::option::Opt;
use meilidb_http::routes;
#[cfg(not(target_os = "macos"))]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
pub fn main() -> Result<(), MainError> {
let opt = Opt::new();
let data = Data::new(opt.clone());
let mut app = tide::App::with_state(data);
app.middleware(
CorsMiddleware::new()
.allow_origin(CorsOrigin::from("*"))
.allow_methods(HeaderValue::from_static("GET, POST, OPTIONS")),
);
app.middleware(RequestLogger::new());
app.middleware(tide_compression::Compression::new());
app.middleware(tide_compression::Decompression::new());
routes::load_routes(&mut app);
info!("Server HTTP enabled");
app.run(opt.http_addr)?;
Ok(())
}

View File

@ -0,0 +1,3 @@
pub mod schema;
pub mod token;
pub mod update_operation;

View File

@ -0,0 +1,118 @@
use std::collections::HashSet;
use indexmap::IndexMap;
use meilidb_schema::{Schema, SchemaBuilder, SchemaProps};
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum FieldProperties {
Identifier,
Indexed,
Displayed,
Ranked,
}
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SchemaBody(IndexMap<String, HashSet<FieldProperties>>);
impl From<Schema> for SchemaBody {
fn from(value: Schema) -> SchemaBody {
let mut map = IndexMap::new();
for (name, _attr, props) in value.iter() {
let old_properties = map.entry(name.to_owned()).or_insert(HashSet::new());
if props.is_indexed() {
old_properties.insert(FieldProperties::Indexed);
}
if props.is_displayed() {
old_properties.insert(FieldProperties::Displayed);
}
if props.is_ranked() {
old_properties.insert(FieldProperties::Ranked);
}
}
let old_properties = map
.entry(value.identifier_name().to_string())
.or_insert(HashSet::new());
old_properties.insert(FieldProperties::Identifier);
old_properties.insert(FieldProperties::Displayed);
SchemaBody(map)
}
}
impl Into<Schema> for SchemaBody {
fn into(self) -> Schema {
let mut identifier = "documentId".to_string();
let mut attributes = IndexMap::new();
for (field, properties) in self.0 {
let mut indexed = false;
let mut displayed = false;
let mut ranked = false;
for property in properties {
match property {
FieldProperties::Indexed => indexed = true,
FieldProperties::Displayed => displayed = true,
FieldProperties::Ranked => ranked = true,
FieldProperties::Identifier => identifier = field.clone(),
}
}
attributes.insert(
field,
SchemaProps {
indexed,
displayed,
ranked,
},
);
}
let mut builder = SchemaBuilder::with_identifier(identifier);
for (field, props) in attributes {
builder.new_attribute(field, props);
}
builder.build()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_schema_body_conversion() {
let schema_body = r#"
{
"id": ["identifier", "indexed", "displayed"],
"title": ["indexed", "displayed"],
"date": ["displayed"]
}
"#;
let schema_builder = r#"
{
"identifier": "id",
"attributes": {
"id": {
"indexed": true,
"displayed": true
},
"title": {
"indexed": true,
"displayed": true
},
"date": {
"displayed": true
}
}
}
"#;
let schema_body: SchemaBody = serde_json::from_str(schema_body).unwrap();
let schema_builder: SchemaBuilder = serde_json::from_str(schema_builder).unwrap();
let schema_from_body: Schema = schema_body.into();
let schema_from_builder: Schema = schema_builder.build();
assert_eq!(schema_from_body, schema_from_builder);
}
}

View File

@ -0,0 +1,72 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
pub const TOKEN_PREFIX_KEY: &str = "_token_";
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum ACL {
IndexesRead,
IndexesWrite,
DocumentsRead,
DocumentsWrite,
SettingsRead,
SettingsWrite,
Admin,
#[serde(rename = "*")]
All,
}
pub type Wildcard = String;
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Token {
pub key: String,
pub description: String,
pub acl: Vec<ACL>,
pub indexes: Vec<Wildcard>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
pub expires_at: DateTime<Utc>,
pub revoked: bool,
}
fn cleanup_wildcard(input: &str) -> (bool, &str, bool) {
let first = input.chars().next().filter(|&c| c == '*').is_some();
let last = input.chars().last().filter(|&c| c == '*').is_some();
let bound_last = std::cmp::max(input.len().saturating_sub(last as usize), first as usize);
let output = input.get(first as usize..bound_last).unwrap();
(first, output, last)
}
pub fn match_wildcard(pattern: &str, input: &str) -> bool {
let (first, pattern, last) = cleanup_wildcard(pattern);
match (first, last) {
(false, false) => pattern == input,
(true, false) => input.ends_with(pattern),
(false, true) => input.starts_with(pattern),
(true, true) => input.contains(pattern),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_match_wildcard() {
assert!(match_wildcard("*", "qqq"));
assert!(match_wildcard("*", ""));
assert!(match_wildcard("*ab", "qqqab"));
assert!(match_wildcard("*ab*", "qqqabqq"));
assert!(match_wildcard("ab*", "abqqq"));
assert!(match_wildcard("**", "ab"));
assert!(match_wildcard("ab", "ab"));
assert!(match_wildcard("ab*", "ab"));
assert!(match_wildcard("*ab", "ab"));
assert!(match_wildcard("*ab*", "ab"));
assert!(match_wildcard("*😆*", "ab😆dsa"));
}
}
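In short: no `*` means exact equality, a leading `*` a suffix match, a trailing `*` a prefix match, and a `*` on both ends a substring match. The saturating_sub/max arithmetic in cleanup_wildcard only guards the degenerate patterns: for "*" (and "**") the bounds collapse to the empty pattern, which contains() accepts for any input, which is why match_wildcard("*", "") holds in the tests above.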

View File

@ -0,0 +1,33 @@
use std::fmt;
#[allow(dead_code)]
#[derive(Debug)]
pub enum UpdateOperation {
ClearAllDocuments,
DocumentsAddition,
DocumentsDeletion,
SynonymsAddition,
SynonymsDeletion,
StopWordsAddition,
StopWordsDeletion,
Schema,
Config,
}
impl fmt::Display for UpdateOperation {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
use UpdateOperation::*;
match self {
ClearAllDocuments => write!(f, "ClearAllDocuments"),
DocumentsAddition => write!(f, "DocumentsAddition"),
DocumentsDeletion => write!(f, "DocumentsDeletion"),
SynonymsAddition => write!(f, "SynonymsAddition"),
SynonymsDeletion => write!(f, "SynonymsDeletion"),
StopWordsAddition => write!(f, "StopWordsAddition"),
StopWordsDeletion => write!(f, "StopWordsDeletion"),
Schema => write!(f, "Schema"),
Config => write!(f, "Config"),
}
}
}

View File

@ -0,0 +1,56 @@
use envconfig::Envconfig;
use structopt::StructOpt;
#[derive(Debug, Clone, StructOpt, Envconfig)]
struct Vars {
/// The destination where the database must be created.
#[structopt(long)]
#[envconfig(from = "MEILI_DATABASE_PATH")]
pub database_path: Option<String>,
/// The address on which the HTTP server will listen.
#[structopt(long)]
#[envconfig(from = "MEILI_HTTP_ADDR")]
pub http_addr: Option<String>,
#[structopt(long)]
#[envconfig(from = "MEILI_ADMIN_TOKEN")]
pub admin_token: Option<String>,
}
#[derive(Clone, Debug)]
pub struct Opt {
pub database_path: String,
pub http_addr: String,
pub admin_token: Option<String>,
}
impl Default for Opt {
fn default() -> Self {
Opt {
database_path: String::from("/tmp/meilidb"),
http_addr: String::from("127.0.0.1:8080"),
admin_token: None,
}
}
}
impl Opt {
pub fn new() -> Self {
let default = Self::default();
let args = Vars::from_args();
let env = Vars::init().unwrap();
Self {
database_path: env
.database_path
.or(args.database_path)
.unwrap_or(default.database_path),
http_addr: env
.http_addr
.or(args.http_addr)
.unwrap_or(default.http_addr),
admin_token: env.admin_token.or(args.admin_token).or(default.admin_token),
}
}
}
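Note the precedence encoded by the Option::or chains: an environment variable wins over the matching command-line flag, which wins over the built-in default, so MEILI_HTTP_ADDR silently overrides --http-addr when both are set.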

View File

@ -0,0 +1,250 @@
use std::collections::{BTreeSet, HashSet};
use http::StatusCode;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tide::querystring::ContextExt as QSContextExt;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
pub async fn get_document(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let identifier = ctx.identifier()?;
let document_id = meilidb_core::serde::compute_document_id(identifier.clone());
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let response = index
.document::<IndexMap<String, Value>>(&reader, None, document_id)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::document_not_found(&identifier))?;
if response.is_empty() {
return Err(ResponseError::document_not_found(identifier));
}
Ok(tide::response::json(response))
}
#[derive(Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct IndexUpdateResponse {
pub update_id: u64,
}
pub async fn delete_document(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let identifier = ctx.identifier()?;
let document_id = meilidb_core::serde::compute_document_id(identifier.clone());
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut documents_deletion = index.documents_deletion();
documents_deletion.delete_document_by_id(document_id);
let update_id = documents_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
#[derive(Default, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct BrowseQuery {
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<String>,
}
pub async fn browse_documents(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let query: BrowseQuery = ctx.url_query().unwrap_or(BrowseQuery::default());
let offset = query.offset.unwrap_or(0);
let limit = query.limit.unwrap_or(20);
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let documents_ids: Result<BTreeSet<_>, _> =
match index.documents_fields_counts.documents_ids(&reader) {
Ok(documents_ids) => documents_ids.skip(offset).take(limit).collect(),
Err(e) => return Err(ResponseError::internal(e)),
};
let documents_ids = match documents_ids {
Ok(documents_ids) => documents_ids,
Err(e) => return Err(ResponseError::internal(e)),
};
let mut response_body = Vec::<IndexMap<String, Value>>::new();
if let Some(attributes) = query.attributes_to_retrieve {
let attributes = attributes.split(',').collect::<HashSet<&str>>();
for document_id in documents_ids {
if let Ok(Some(document)) = index.document(&reader, Some(&attributes), document_id) {
response_body.push(document);
}
}
} else {
for document_id in documents_ids {
if let Ok(Some(document)) = index.document(&reader, None, document_id) {
response_body.push(document);
}
}
}
if response_body.is_empty() {
Ok(tide::response::json(response_body)
.with_status(StatusCode::NO_CONTENT)
.into_response())
} else {
Ok(tide::response::json(response_body)
.with_status(StatusCode::OK)
.into_response())
}
}
fn inferred_schema(document: &IndexMap<String, Value>) -> Option<meilidb_schema::Schema> {
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};
let mut identifier = None;
for key in document.keys() {
if identifier.is_none() && key.to_lowercase().contains("id") {
identifier = Some(key);
}
}
match identifier {
Some(identifier) => {
let mut builder = SchemaBuilder::with_identifier(identifier);
for key in document.keys() {
builder.new_attribute(key, DISPLAYED | INDEXED);
}
Some(builder.build())
}
None => None,
}
}
pub async fn add_or_update_multiple_documents(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<IndexMap<String, Value>> =
ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let current_schema = index
.main
.schema(&writer)
.map_err(ResponseError::internal)?;
if current_schema.is_none() {
match data.first().and_then(inferred_schema) {
Some(schema) => {
index
.schema_update(&mut writer, schema)
.map_err(ResponseError::internal)?;
}
None => return Err(ResponseError::bad_request("Could not infer a schema")),
}
}
let mut document_addition = index.documents_addition();
for document in data {
document_addition.update_document(document);
}
let update_id = document_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete_multiple_documents(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<Value> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut documents_deletion = index.documents_deletion();
for identifier in data {
if let Some(identifier) = meilidb_core::serde::value_to_string(&identifier) {
documents_deletion
.delete_document_by_id(meilidb_core::serde::compute_document_id(identifier));
}
}
let update_id = documents_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn clear_all_documents(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let update_id = index
.clear_all(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}

View File

@ -0,0 +1,79 @@
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
use heed::types::{Str, Unit};
use serde::Deserialize;
use tide::Context;
const UNHEALTHY_KEY: &str = "_is_unhealthy";
pub async fn get_health(ctx: Context<Data>) -> SResult<()> {
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
if let Ok(Some(_)) = common_store.get::<Str, Unit>(&reader, UNHEALTHY_KEY) {
return Err(ResponseError::Maintenance);
}
Ok(())
}
pub async fn set_healthy(ctx: Context<Data>) -> SResult<()> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
match common_store.delete::<Str>(&mut writer, UNHEALTHY_KEY) {
Ok(_) => (),
Err(e) => return Err(ResponseError::internal(e)),
}
if let Err(e) = writer.commit() {
return Err(ResponseError::internal(e));
}
Ok(())
}
pub async fn set_unhealthy(ctx: Context<Data>) -> SResult<()> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
if let Err(e) = common_store.put::<Str, Unit>(&mut writer, UNHEALTHY_KEY, &()) {
return Err(ResponseError::internal(e));
}
if let Err(e) = writer.commit() {
return Err(ResponseError::internal(e));
}
Ok(())
}
#[derive(Deserialize, Clone)]
struct HealthBody {
health: bool,
}
pub async fn change_healthyness(mut ctx: Context<Data>) -> SResult<()> {
let body: HealthBody = ctx.body_json().await.map_err(ResponseError::bad_request)?;
if body.health {
set_healthy(ctx).await
} else {
set_unhealthy(ctx).await
}
}

View File

@ -0,0 +1,210 @@
use http::StatusCode;
use meilidb_core::{ProcessedUpdateResult, UpdateStatus};
use meilidb_schema::Schema;
use serde_json::json;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::schema::SchemaBody;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
pub async fn list_indexes(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let list = ctx
.state()
.db
.indexes_names()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(list))
}
pub async fn get_index_schema(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let schema = index
.main
.schema(&reader)
.map_err(ResponseError::internal)?;
match schema {
Some(schema) => {
let schema = SchemaBody::from(schema);
Ok(tide::response::json(schema))
}
None => Ok(
tide::response::json(json!({ "message": "missing index schema" }))
.with_status(StatusCode::NOT_FOUND)
.into_response(),
),
}
}
pub async fn create_index(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesWrite)?;
let index_name = ctx.url_param("index")?;
let body = ctx.body_bytes().await.map_err(ResponseError::bad_request)?;
let schema: Option<Schema> = if body.is_empty() {
None
} else {
serde_json::from_slice::<SchemaBody>(&body)
.map_err(ResponseError::bad_request)
.map(|s| Some(s.into()))?
};
let db = &ctx.state().db;
let created_index = match db.create_index(&index_name) {
Ok(index) => index,
Err(meilidb_core::Error::IndexAlreadyExists) => db.open_index(&index_name).ok_or(
ResponseError::internal("index already exists but could not be opened"),
)?,
Err(e) => return Err(ResponseError::create_index(e)),
};
let callback_context = ctx.state().clone();
let callback_name = index_name.clone();
db.set_update_callback(
&index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
match schema {
Some(schema) => {
let update_id = created_index
.schema_update(&mut writer, schema.clone())
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::CREATED)
.into_response())
}
None => Ok(Response::new(tide::Body::empty())
.with_status(StatusCode::NO_CONTENT)
.into_response()),
}
}
pub async fn update_schema(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesWrite)?;
let index_name = ctx.url_param("index")?;
let schema = ctx
.body_json::<SchemaBody>()
.await
.map_err(ResponseError::bad_request)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let index = db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(index_name))?;
let schema: meilidb_schema::Schema = schema.into();
let update_id = index
.schema_update(&mut writer, schema.clone())
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn get_update_status(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let update_id = ctx
.param::<u64>("update_id")
.map_err(|e| ResponseError::bad_parameter("update_id", e))?;
let index = ctx.index()?;
let status = index
.update_status(&reader, update_id)
.map_err(ResponseError::internal)?;
let response = match status {
UpdateStatus::Enqueued(data) => {
tide::response::json(json!({ "status": "enqueued", "data": data }))
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Processed(data) => {
tide::response::json(json!({ "status": "processed", "data": data }))
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Unknown => tide::response::json(json!({ "message": "unknown update id" }))
.with_status(StatusCode::NOT_FOUND)
.into_response(),
};
Ok(response)
}
pub async fn get_all_updates_status(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let index = ctx.index()?;
let all_status = index
.all_updates_status(&reader)
.map_err(ResponseError::internal)?;
let response = tide::response::json(all_status)
.with_status(StatusCode::OK)
.into_response();
Ok(response)
}
pub async fn delete_index(ctx: Context<Data>) -> SResult<StatusCode> {
ctx.is_allowed(IndexesWrite)?;
let _index_name = ctx.url_param("index")?;
let _index = ctx.index()?;
// ctx.state()
// .db
// .delete_index(&index_name)
// .map_err(ResponseError::internal)?;
Ok(StatusCode::NOT_IMPLEMENTED)
}
pub fn index_update_callback(index_name: &str, data: &Data, _status: ProcessedUpdateResult) {
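// This callback runs on the update thread, outside any HTTP request, so
// there is no client to report failures to; presumably that is why the
// operations below simply unwrap.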
let env = &data.db.env;
let mut writer = env.write_txn().unwrap();
data.compute_stats(&mut writer, &index_name).unwrap();
data.set_last_update(&mut writer, &index_name).unwrap();
writer.commit().unwrap();
}

View File

@ -0,0 +1,189 @@
use chrono::serde::ts_seconds;
use chrono::{DateTime, Utc};
use heed::types::{SerdeBincode, Str};
use http::StatusCode;
use rand::seq::SliceRandom;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::models::token::*;
use crate::Data;
fn generate_api_key() -> String {
let mut rng = rand::thread_rng();
let sample = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
// Sample *with* replacement: `choose_multiple` samples without it and caps
// the amount at the slice length, so asking it for 40 items out of a
// 36-symbol alphabet returned a 36-char shuffle of the whole alphabet.
(0..40)
.map(|_| *sample.choose(&mut rng).expect("non-empty alphabet") as char)
.collect()
}
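A standalone check of the sampling pitfall noted above (only the rand crate is assumed):

use rand::seq::SliceRandom;

fn main() {
    let mut rng = rand::thread_rng();
    let alphabet = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    // Without replacement, the amount is capped at the alphabet size (36).
    let without: String = alphabet
        .choose_multiple(&mut rng, 40)
        .map(|c| *c as char)
        .collect();
    // With replacement, the key really is 40 characters, repeats allowed.
    let with: String = (0..40)
        .map(|_| *alphabet.choose(&mut rng).unwrap() as char)
        .collect();
    assert_eq!(without.len(), 36);
    assert_eq!(with.len(), 40);
}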
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let mut response: Vec<Token> = Vec::new();
let iter = common_store
.prefix_iter::<Str, SerdeBincode<Token>>(&reader, TOKEN_PREFIX_KEY)
.map_err(ResponseError::internal)?;
for result in iter {
let (_, token) = result.map_err(ResponseError::internal)?;
response.push(token);
}
Ok(tide::response::json(response))
}
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
let token_config = db
.common_store()
.get::<Str, SerdeBincode<Token>>(&reader, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
Ok(tide::response::json(token_config))
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct CreatedRequest {
description: String,
acl: Vec<ACL>,
indexes: Vec<Wildcard>,
#[serde(with = "ts_seconds")]
expires_at: DateTime<Utc>,
}
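Note the ts_seconds attribute on expiresAt: the field travels as a Unix timestamp, not an RFC 3339 string. A minimal sketch of the wire format (chrono with its serde feature plus serde_json assumed; the struct name is made up for the example):

use chrono::serde::ts_seconds;
use chrono::{DateTime, TimeZone, Utc};
use serde::Deserialize;

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct Expiry {
    #[serde(with = "ts_seconds")]
    expires_at: DateTime<Utc>,
}

fn main() {
    // 1577836800 seconds after the epoch is 2020-01-01T00:00:00Z.
    let body: Expiry = serde_json::from_str(r#"{ "expiresAt": 1577836800 }"#).unwrap();
    assert_eq!(body.expires_at, Utc.ymd(2020, 1, 1).and_hms(0, 0, 0));
}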
pub async fn create(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let data: CreatedRequest = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let key = generate_api_key();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, key);
let token_definition = Token {
key,
description: data.description,
acl: data.acl,
indexes: data.indexes,
expires_at: data.expires_at,
created_at: Utc::now(),
updated_at: Utc::now(),
revoked: false,
};
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
db.common_store()
.put::<Str, SerdeBincode<Token>>(&mut writer, &token_key, &token_definition)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(tide::response::json(token_definition)
.with_status(StatusCode::CREATED)
.into_response())
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct UpdatedRequest {
description: Option<String>,
acl: Option<Vec<ACL>>,
indexes: Option<Vec<Wildcard>>,
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let data: UpdatedRequest = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
let mut token_config = common_store
.get::<Str, SerdeBincode<Token>>(&writer, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
// apply the modifications
if let Some(description) = data.description {
token_config.description = description;
}
if let Some(acl) = data.acl {
token_config.acl = acl;
}
if let Some(indexes) = data.indexes {
token_config.indexes = indexes;
}
token_config.updated_at = Utc::now();
common_store
.put::<Str, SerdeBincode<Token>>(&mut writer, &token_key, &token_config)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(tide::response::json(token_config)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(ctx: Context<Data>) -> SResult<StatusCode> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
common_store
.delete::<Str>(&mut writer, &token_key)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(StatusCode::ACCEPTED)
}

View File

@ -0,0 +1,111 @@
use crate::data::Data;
pub mod document;
pub mod health;
pub mod index;
pub mod key;
pub mod search;
pub mod setting;
pub mod stats;
pub mod stop_words;
pub mod synonym;
pub fn load_routes(app: &mut tide::App<Data>) {
app.at("").nest(|router| {
router.at("/indexes").nest(|router| {
router.at("/").get(index::list_indexes);
router.at("/search").post(search::search_multi_index);
router.at("/:index").nest(|router| {
router.at("/search").get(search::search_with_url_query);
router.at("/updates").nest(|router| {
router.at("/").get(index::get_all_updates_status);
router.at("/:update_id").get(index::get_update_status);
});
router
.at("/")
.get(index::get_index_schema)
.post(index::create_index)
.put(index::update_schema)
.delete(index::delete_index);
router.at("/documents").nest(|router| {
router
.at("/")
.get(document::browse_documents)
.post(document::add_or_update_multiple_documents)
.delete(document::clear_all_documents);
router.at("/:identifier").nest(|router| {
router
.at("/")
.get(document::get_document)
.delete(document::delete_document);
});
router
.at("/delete")
.post(document::delete_multiple_documents);
});
router.at("/synonym").nest(|router| {
router.at("/").get(synonym::list).post(synonym::create);
router
.at("/:synonym")
.get(synonym::get)
.put(synonym::update)
.delete(synonym::delete);
router.at("/batch").post(synonym::batch_write);
router.at("/clear").post(synonym::clear);
});
router.at("/stop-words").nest(|router| {
router
.at("/")
.get(stop_words::list)
.put(stop_words::add)
.delete(stop_words::delete);
});
router
.at("/settings")
.get(setting::get)
.post(setting::update);
});
});
router.at("/keys").nest(|router| {
router.at("/").get(key::list).post(key::create);
router
.at("/:key")
.get(key::get)
.put(key::update)
.delete(key::delete);
});
});
// Private
app.at("").nest(|router| {
router
.at("/health")
.get(health::get_health)
.post(health::set_healthy)
.put(health::change_healthiness)
.delete(health::set_unhealthy);
router.at("/stats").get(stats::get_stats);
router.at("/stats/:index").get(stats::index_stat);
router.at("/version").get(stats::get_version);
router.at("/sys-info").get(stats::get_sys_info);
router
.at("/sys-info/pretty")
.get(stats::get_sys_info_pretty);
});
}

View File

@ -0,0 +1,231 @@
use std::collections::HashMap;
use std::collections::HashSet;
use std::time::Duration;
use meilidb_core::Index;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
use tide::querystring::ContextExt as QSContextExt;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::meilidb::{Error, IndexSearchExt, SearchHit};
use crate::helpers::tide::ContextExt;
use crate::Data;
#[derive(Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct SearchQuery {
q: String,
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<String>,
attributes_to_search_in: Option<String>,
attributes_to_crop: Option<String>,
crop_length: Option<usize>,
attributes_to_highlight: Option<String>,
filters: Option<String>,
timeout_ms: Option<u64>,
matches: Option<bool>,
}
pub async fn search_with_url_query(ctx: Context<Data>) -> SResult<Response> {
// ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let query: SearchQuery = ctx
.url_query()
.map_err(|_| ResponseError::bad_request("invalid query parameter"))?;
let mut search_builder = index.new_search(query.q.clone());
if let Some(offset) = query.offset {
search_builder.offset(offset);
}
if let Some(limit) = query.limit {
search_builder.limit(limit);
}
if let Some(attributes_to_retrieve) = query.attributes_to_retrieve {
for attr in attributes_to_retrieve.split(',') {
search_builder.add_retrievable_field(attr.to_string());
}
}
if let Some(attributes_to_search_in) = query.attributes_to_search_in {
// This previously re-used `add_retrievable_field`, a copy-paste bug;
// restrict the searched attributes instead, as the multi-index route does.
let attributes_to_search_in = attributes_to_search_in
.split(',')
.map(ToString::to_string)
.collect();
search_builder.attributes_to_search_in(attributes_to_search_in);
}
if let Some(attributes_to_crop) = query.attributes_to_crop {
let crop_length = query.crop_length.unwrap_or(200);
let attributes_to_crop = attributes_to_crop
.split(',')
.map(|r| (r.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
}
if let Some(attributes_to_highlight) = query.attributes_to_highlight {
let attributes_to_highlight = attributes_to_highlight
.split(',')
.map(ToString::to_string)
.collect();
search_builder.attributes_to_highlight(attributes_to_highlight);
}
if let Some(filters) = query.filters {
search_builder.filters(filters);
}
if let Some(timeout_ms) = query.timeout_ms {
search_builder.timeout(Duration::from_millis(timeout_ms));
}
if let Some(matches) = query.matches {
if matches {
search_builder.get_matches();
}
}
let response = match search_builder.search(&reader) {
Ok(response) => response,
Err(Error::Internal(message)) => return Err(ResponseError::Internal(message)),
Err(others) => return Err(ResponseError::bad_request(others)),
};
Ok(tide::response::json(response))
}
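Because SearchQuery is renamed to camelCase and denies unknown fields, the accepted URL parameters are the camel-cased names (q, limit, timeoutMs, ...) and misspelled ones are rejected rather than ignored. A reduced sketch of that deserialization (serde + serde_urlencoded assumed; whether tide parses the query string with serde_urlencoded internally is an assumption, the serde behavior is the point):

use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct SearchQuery {
    q: String,
    limit: Option<usize>,
    timeout_ms: Option<u64>,
}

fn main() {
    // e.g. GET /indexes/movies/search?q=matrix&limit=5&timeoutMs=30
    let query: SearchQuery =
        serde_urlencoded::from_str("q=matrix&limit=5&timeoutMs=30").unwrap();
    assert_eq!(query.q, "matrix");
    assert_eq!(query.limit, Some(5));
    assert_eq!(query.timeout_ms, Some(30));
    // Unknown or misspelled parameters fail instead of being silently dropped.
    assert!(serde_urlencoded::from_str::<SearchQuery>("q=matrix&foo=1").is_err());
}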
#[derive(Clone, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct SearchMultiBody {
indexes: HashSet<String>,
query: String,
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<HashSet<String>>,
attributes_to_search_in: Option<HashSet<String>>,
attributes_to_crop: Option<HashMap<String, usize>>,
attributes_to_highlight: Option<HashSet<String>>,
filters: Option<String>,
timeout_ms: Option<u64>,
matches: Option<bool>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct SearchMultiBodyResponse {
hits: HashMap<String, Vec<SearchHit>>,
offset: usize,
hits_per_page: usize,
processing_time_ms: usize,
query: String,
}
pub async fn search_multi_index(mut ctx: Context<Data>) -> SResult<Response> {
// ctx.is_allowed(DocumentsRead)?;
let body = ctx
.body_json::<SearchMultiBody>()
.await
.map_err(ResponseError::bad_request)?;
let mut index_list = body.indexes.clone();
// A single "*" entry expands to every index in the database.
if index_list.contains("*") {
index_list = ctx
.state()
.db
.indexes_names()
.map_err(ResponseError::internal)?
.into_iter()
.collect();
}
// Honor `offset` and `limit` independently; the previous code silently
// dropped `offset` unless `limit` was also provided.
let offset = body.offset.unwrap_or(0);
let count = body.limit.unwrap_or(20);
let db = &ctx.state().db;
let par_body = body.clone();
let responses_per_index: Vec<SResult<_>> = index_list
.into_par_iter()
.map(move |index_name| {
let index: Index = db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(&index_name))?;
let mut search_builder = index.new_search(par_body.query.clone());
search_builder.offset(offset);
search_builder.limit(count);
if let Some(attributes_to_retrieve) = par_body.attributes_to_retrieve.clone() {
search_builder.attributes_to_retrieve(attributes_to_retrieve);
}
if let Some(attributes_to_search_in) = par_body.attributes_to_search_in.clone() {
search_builder.attributes_to_search_in(attributes_to_search_in);
}
if let Some(attributes_to_crop) = par_body.attributes_to_crop.clone() {
search_builder.attributes_to_crop(attributes_to_crop);
}
if let Some(attributes_to_highlight) = par_body.attributes_to_highlight.clone() {
search_builder.attributes_to_highlight(attributes_to_highlight);
}
if let Some(filters) = par_body.filters.clone() {
search_builder.filters(filters);
}
if let Some(timeout_ms) = par_body.timeout_ms {
// `timeout_ms` is in milliseconds; `Duration::from_secs` here was a unit bug.
search_builder.timeout(Duration::from_millis(timeout_ms));
}
if let Some(matches) = par_body.matches {
if matches {
search_builder.get_matches();
}
}
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let response = search_builder
.search(&reader)
.map_err(ResponseError::internal)?;
Ok((index_name, response))
})
.collect();
let mut hits_map = HashMap::new();
let mut max_query_time = 0;
for response in responses_per_index {
if let Ok((index_name, response)) = response {
if response.processing_time_ms > max_query_time {
max_query_time = response.processing_time_ms;
}
hits_map.insert(index_name, response.hits);
}
}
let response = SearchMultiBodyResponse {
hits: hits_map,
offset,
hits_per_page: count,
processing_time_ms: max_query_time,
query: body.query,
};
Ok(tide::response::json(response))
}

View File

@ -0,0 +1,93 @@
use std::collections::{HashMap, HashSet};
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SettingBody {
pub stop_words: Option<StopWords>,
pub ranking_order: Option<RankingOrder>,
pub distinct_field: Option<DistinctField>,
pub ranking_rules: Option<RankingRules>,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RankingOrdering {
Asc,
Dsc,
}
pub type StopWords = HashSet<String>;
pub type RankingOrder = Vec<String>;
pub type DistinctField = String;
pub type RankingRules = HashMap<String, RankingOrdering>;
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let settings = match index.main.customs(&reader).map_err(ResponseError::internal)? {
Some(bytes) => bincode::deserialize(bytes).map_err(ResponseError::internal)?,
None => SettingBody::default(),
};
Ok(tide::response::json(settings))
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let settings: SettingBody = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut current_settings = match index.main.customs(&writer).map_err(ResponseError::internal)? {
Some(bytes) => bincode::deserialize(bytes).map_err(ResponseError::internal)?,
None => SettingBody::default(),
};
if let Some(stop_words) = settings.stop_words {
current_settings.stop_words = Some(stop_words);
}
if let Some(ranking_order) = settings.ranking_order {
current_settings.ranking_order = Some(ranking_order);
}
if let Some(distinct_field) = settings.distinct_field {
current_settings.distinct_field = Some(distinct_field);
}
if let Some(ranking_rules) = settings.ranking_rules {
current_settings.ranking_rules = Some(ranking_rules);
}
let bytes = bincode::serialize(&current_settings).map_err(ResponseError::internal)?;
let update_id = index
.customs_update(&mut writer, bytes)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
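The update handler above behaves like a merge-patch: only the fields present in the request overwrite the stored settings before the result is re-serialized and enqueued as an update. The merge step in isolation (pure std; types simplified, RankingOrdering replaced by String for brevity):

use std::collections::{HashMap, HashSet};

#[derive(Debug, Default)]
struct SettingBody {
    stop_words: Option<HashSet<String>>,
    ranking_order: Option<Vec<String>>,
    distinct_field: Option<String>,
    ranking_rules: Option<HashMap<String, String>>,
}

// Fields present in the patch win; absent fields keep their stored value.
fn merge(mut current: SettingBody, patch: SettingBody) -> SettingBody {
    if let Some(v) = patch.stop_words { current.stop_words = Some(v); }
    if let Some(v) = patch.ranking_order { current.ranking_order = Some(v); }
    if let Some(v) = patch.distinct_field { current.distinct_field = Some(v); }
    if let Some(v) = patch.ranking_rules { current.ranking_rules = Some(v); }
    current
}

fn main() {
    let stored = SettingBody { distinct_field: Some("id".into()), ..Default::default() };
    let patch = SettingBody { ranking_order: Some(vec!["typo".into()]), ..Default::default() };
    let merged = merge(stored, patch);
    assert_eq!(merged.distinct_field.as_deref(), Some("id")); // untouched
    assert!(merged.ranking_order.is_some()); // overwritten
}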

View File

@ -0,0 +1,334 @@
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use pretty_bytes::converter::convert;
use serde::Serialize;
use sysinfo::{NetworkExt, Pid, ProcessExt, ProcessorExt, System, SystemExt};
use tide::{Context, Response};
use walkdir::WalkDir;
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct IndexStatsResponse {
number_of_documents: u64,
is_indexing: bool,
last_update: Option<DateTime<Utc>>,
fields_frequency: HashMap<String, usize>,
}
pub async fn index_stat(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let index_name = ctx.url_param("index")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let number_of_documents = index
.main
.number_of_documents(&reader)
.map_err(ResponseError::internal)?;
let fields_frequency = ctx
.state()
.fields_frequency(&reader, &index_name)
.map_err(ResponseError::internal)?
.unwrap_or_default();
let is_indexing = ctx
.state()
.is_indexing(&reader, &index_name)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found("Index not found"))?;
let last_update = ctx
.state()
.last_update(&reader, &index_name)
.map_err(ResponseError::internal)?;
let response = IndexStatsResponse {
number_of_documents,
is_indexing,
last_update,
fields_frequency,
};
Ok(tide::response::json(response))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct StatsResult {
database_size: u64,
indexes: HashMap<String, IndexStatsResponse>,
}
pub async fn get_stats(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let mut index_list = HashMap::new();
if let Ok(indexes_set) = ctx.state().db.indexes_names() {
for index_name in indexes_set {
let db = &ctx.state().db;
let env = &db.env;
let index = db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(&index_name))?;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let number_of_documents = index
.main
.number_of_documents(&reader)
.map_err(ResponseError::internal)?;
let fields_frequency = ctx
.state()
.fields_frequency(&reader, &index_name)
.map_err(ResponseError::internal)?
.unwrap_or_default();
let is_indexing = ctx
.state()
.is_indexing(&reader, &index_name)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found("Index not found"))?;
let last_update = ctx
.state()
.last_update(&reader, &index_name)
.map_err(ResponseError::internal)?;
let response = IndexStatsResponse {
number_of_documents,
is_indexing,
last_update,
fields_frequency,
};
index_list.insert(index_name, response);
}
}
let database_size = WalkDir::new(ctx.state().db_path.clone())
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len());
let response = StatsResult {
database_size,
indexes: index_list,
};
Ok(tide::response::json(response))
}
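The databaseSize figure is a plain recursive sum of file lengths under the LMDB directory. The same fold as a standalone helper (only walkdir assumed; the path is just an example):

use walkdir::WalkDir;

fn dir_size(path: &str) -> u64 {
    WalkDir::new(path)
        .into_iter()
        .filter_map(|entry| entry.ok())
        .filter_map(|entry| entry.metadata().ok())
        .filter(|metadata| metadata.is_file())
        .fold(0, |acc, metadata| acc + metadata.len())
}

fn main() {
    println!("{} bytes", dir_size("/tmp"));
}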
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct VersionResponse {
commit_sha: String,
build_date: String,
pkg_version: String,
}
pub async fn get_version(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let response = VersionResponse {
commit_sha: env!("VERGEN_SHA").to_string(),
build_date: env!("VERGEN_BUILD_TIMESTAMP").to_string(),
pkg_version: env!("CARGO_PKG_VERSION").to_string(),
};
Ok(tide::response::json(response))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysGlobal {
total_memory: u64,
used_memory: u64,
total_swap: u64,
used_swap: u64,
input_data: u64,
output_data: u64,
}
impl SysGlobal {
fn new() -> SysGlobal {
SysGlobal {
total_memory: 0,
used_memory: 0,
total_swap: 0,
used_swap: 0,
input_data: 0,
output_data: 0,
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysProcess {
memory: u64,
cpu: f32,
}
impl SysProcess {
fn new() -> SysProcess {
SysProcess {
memory: 0,
cpu: 0.0,
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysInfo {
memory_usage: f64,
processor_usage: Vec<f32>,
global: SysGlobal,
process: SysProcess,
}
impl SysInfo {
fn new() -> SysInfo {
SysInfo {
memory_usage: 0.0,
processor_usage: Vec::new(),
global: SysGlobal::new(),
process: SysProcess::new(),
}
}
}
pub(crate) fn report(pid: Pid) -> SysInfo {
let mut sys = System::new();
// Refresh before reading the values; the previous trailing `refresh_all`
// ran after everything had already been read and so had no effect.
sys.refresh_all();
let mut info = SysInfo::new();
info.memory_usage = sys.get_used_memory() as f64 / sys.get_total_memory() as f64 * 100.0;
for processor in sys.get_processor_list() {
info.processor_usage.push(processor.get_cpu_usage() * 100.0);
}
info.global.total_memory = sys.get_total_memory();
info.global.used_memory = sys.get_used_memory();
info.global.total_swap = sys.get_total_swap();
info.global.used_swap = sys.get_used_swap();
info.global.input_data = sys.get_network().get_income();
info.global.output_data = sys.get_network().get_outcome();
if let Some(process) = sys.get_process(pid) {
info.process.memory = process.memory();
info.process.cpu = process.cpu_usage() * 100.0;
}
info
}
pub async fn get_sys_info(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
Ok(tide::response::json(report(ctx.state().server_pid)))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysGlobalPretty {
total_memory: String,
used_memory: String,
total_swap: String,
used_swap: String,
input_data: String,
output_data: String,
}
impl SysGlobalPretty {
fn new() -> SysGlobalPretty {
SysGlobalPretty {
total_memory: "None".to_owned(),
used_memory: "None".to_owned(),
total_swap: "None".to_owned(),
used_swap: "None".to_owned(),
input_data: "None".to_owned(),
output_data: "None".to_owned(),
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysProcessPretty {
memory: String,
cpu: String,
}
impl SysProcessPretty {
fn new() -> SysProcessPretty {
SysProcessPretty {
memory: "None".to_owned(),
cpu: "None".to_owned(),
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysInfoPretty {
memory_usage: String,
processor_usage: Vec<String>,
global: SysGlobalPretty,
process: SysProcessPretty,
}
impl SysInfoPretty {
fn new() -> SysInfoPretty {
SysInfoPretty {
memory_usage: "None".to_owned(),
processor_usage: Vec::new(),
global: SysGlobalPretty::new(),
process: SysProcessPretty::new(),
}
}
}
pub(crate) fn report_pretty(pid: Pid) -> SysInfoPretty {
let mut sys = System::new();
// Same ordering fix as in `report`: refresh before reading.
sys.refresh_all();
let mut info = SysInfoPretty::new();
info.memory_usage = format!(
"{:.1} %",
sys.get_used_memory() as f64 / sys.get_total_memory() as f64 * 100.0
);
for processor in sys.get_processor_list() {
info.processor_usage
.push(format!("{:.1} %", processor.get_cpu_usage() * 100.0));
}
info.global.total_memory = convert(sys.get_total_memory() as f64 * 1024.0);
info.global.used_memory = convert(sys.get_used_memory() as f64 * 1024.0);
info.global.total_swap = convert(sys.get_total_swap() as f64 * 1024.0);
info.global.used_swap = convert(sys.get_used_swap() as f64 * 1024.0);
info.global.input_data = convert(sys.get_network().get_income() as f64);
info.global.output_data = convert(sys.get_network().get_outcome() as f64);
if let Some(process) = sys.get_process(pid) {
info.process.memory = convert(process.memory() as f64 * 1024.0);
info.process.cpu = format!("{:.1} %", process.cpu_usage() * 100.0);
}
info
}
pub async fn get_sys_info_pretty(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
Ok(tide::response::json(report_pretty(ctx.state().server_pid)))
}

View File

@ -0,0 +1,82 @@
use http::StatusCode;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let stop_words_fst = index
.main
.stop_words_fst(&reader)
.map_err(ResponseError::internal)?;
let stop_words = stop_words_fst
.unwrap_or_default()
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(stop_words))
}
pub async fn add(mut ctx: Context<Data>) -> SResult<Response> {
// Adding stop words mutates the index settings, so require the write ACL.
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut stop_words_addition = index.stop_words_addition();
for stop_word in data {
stop_words_addition.add_stop_word(stop_word);
}
let update_id = stop_words_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(mut ctx: Context<Data>) -> SResult<Response> {
// Deleting stop words mutates the index settings, so require the write ACL.
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut stop_words_deletion = index.stop_words_deletion();
for stop_word in data {
stop_words_deletion.delete_stop_word(stop_word);
}
let update_id = stop_words_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}

View File

@ -0,0 +1,235 @@
use std::collections::HashMap;
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
#[derive(Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonym {
OneWay(SynonymOneWay),
MultiWay { synonyms: Vec<String> },
}
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SynonymOneWay {
pub input: String,
pub synonyms: Vec<String>,
}
pub type Synonyms = Vec<Synonym>;
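Being untagged, Synonym is distinguished purely by shape: a body with an input field parses as one-way, a body with only synonyms as multi-way. A standalone sketch (serde + serde_json assumed; the one-way variant is inlined for brevity):

use serde::Deserialize;
use serde_json::json;

#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum Synonym {
    OneWay { input: String, synonyms: Vec<String> },
    MultiWay { synonyms: Vec<String> },
}

fn main() {
    // "street" resolves to the alternatives, but not the other way around.
    let one: Synonym =
        serde_json::from_value(json!({ "input": "street", "synonyms": ["st", "str"] })).unwrap();
    // Every word becomes an alternative of every other one.
    let multi: Synonym =
        serde_json::from_value(json!({ "synonyms": ["street", "st", "str"] })).unwrap();
    assert!(matches!(one, Synonym::OneWay { .. }));
    assert!(matches!(multi, Synonym::MultiWay { .. }));
}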
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let synonyms_fst = index
.main
.synonyms_fst(&reader)
.map_err(ResponseError::internal)?;
let synonyms_fst = synonyms_fst.unwrap_or_default();
let synonyms_list = synonyms_fst
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
let mut response = HashMap::new();
let index_synonyms = &index.synonyms;
for synonym in synonyms_list {
// A key present in the fst but absent from the synonyms store is an
// internal inconsistency; report it instead of panicking.
let alternative_list = index_synonyms
.synonyms(&reader, synonym.as_bytes())
.map_err(ResponseError::internal)?
.ok_or(ResponseError::internal("inconsistent synonyms store"))?
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
response.insert(synonym, alternative_list);
}
Ok(tide::response::json(response))
}
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let synonym_list = index
.synonyms
.synonyms(&reader, synonym.as_bytes())
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!("synonym: {}", synonym)))?
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(synonym_list))
}
pub async fn create(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let data: Synonym = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
match data.clone() {
Synonym::OneWay(content) => {
synonyms_addition.add_synonym(content.input, content.synonyms.into_iter())
}
Synonym::MultiWay { mut synonyms } => {
if synonyms.len() > 1 {
for _ in 0..synonyms.len() {
let (first, elems) = synonyms.split_first().unwrap();
synonyms_addition.add_synonym(first, elems.iter());
synonyms.rotate_left(1);
}
}
}
}
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::CREATED)
.into_response())
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
synonyms_addition.add_synonym(synonym.clone(), data.clone().into_iter());
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_deletion = index.synonyms_deletion();
synonyms_deletion.delete_all_alternatives_of(synonym);
let update_id = synonyms_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn batch_write(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let data: Synonyms = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
for raw in data {
match raw {
Synonym::OneWay(content) => {
synonyms_addition.add_synonym(content.input, content.synonyms.into_iter())
}
Synonym::MultiWay { mut synonyms } => {
if synonyms.len() > 1 {
for _ in 0..synonyms.len() {
let (first, elems) = synonyms.split_first().unwrap();
synonyms_addition.add_synonym(first, elems.iter());
synonyms.rotate_left(1);
}
}
}
}
}
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn clear(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let synonyms_fst = index
.main
.synonyms_fst(&writer)
.map_err(ResponseError::internal)?;
let synonyms_fst = synonyms_fst.unwrap_or_default();
let synonyms_list = synonyms_fst
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
let mut synonyms_deletion = index.synonyms_deletion();
for synonym in synonyms_list {
synonyms_deletion.delete_all_alternatives_of(synonym);
}
let update_id = synonyms_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
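The multi-way branch in create and batch_write registers every word as an input whose alternatives are all the other words, by splitting off the head and rotating the vector once per element. That loop in isolation (pure std):

fn main() {
    let mut synonyms = vec!["street".to_string(), "st".to_string(), "str".to_string()];
    let mut pairs: Vec<(String, Vec<String>)> = Vec::new();
    if synonyms.len() > 1 {
        for _ in 0..synonyms.len() {
            let (first, elems) = synonyms.split_first().unwrap();
            pairs.push((first.clone(), elems.to_vec()));
            synonyms.rotate_left(1);
        }
    }
    // Yields ("street", ["st", "str"]), ("st", ["str", "street"]),
    // ("str", ["street", "st"]).
    assert_eq!(pairs.len(), 3);
    for (input, alternatives) in &pairs {
        println!("{} => {:?}", input, alternatives);
    }
}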

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-schema"
version = "0.1.0"
version = "0.6.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"

View File

@ -22,7 +22,7 @@ pub const RANKED: SchemaProps = SchemaProps {
ranked: true,
};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
pub displayed: bool,
@ -60,6 +60,36 @@ impl BitOr for SchemaProps {
}
}
impl fmt::Debug for SchemaProps {
#[allow(non_camel_case_types)]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
#[derive(Debug)]
struct DISPLAYED;
#[derive(Debug)]
struct INDEXED;
#[derive(Debug)]
struct RANKED;
let mut debug_set = f.debug_set();
if self.displayed {
debug_set.entry(&DISPLAYED);
}
if self.indexed {
debug_set.entry(&INDEXED);
}
if self.ranked {
debug_set.entry(&RANKED);
}
debug_set.finish()
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
identifier: String,
@ -102,12 +132,12 @@ impl SchemaBuilder {
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Clone, PartialEq, Eq)]
pub struct Schema {
inner: Arc<InnerSchema>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
@ -139,6 +169,10 @@ impl Schema {
attributes
}
pub fn number_of_attributes(&self) -> usize {
self.inner.attrs.len()
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
@ -184,6 +218,16 @@ impl<'de> Deserialize<'de> for Schema {
}
}
impl fmt::Debug for Schema {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let builder = self.to_builder();
f.debug_struct("Schema")
.field("identifier", &builder.identifier)
.field("attributes", &builder.attributes)
.finish()
}
}
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
@ -215,11 +259,155 @@ impl fmt::Display for SchemaAttr {
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Diff {
IdentChange {
old: String,
new: String,
},
AttrMove {
name: String,
old: usize,
new: usize,
},
AttrPropsChange {
name: String,
old: SchemaProps,
new: SchemaProps,
},
NewAttr {
name: String,
pos: usize,
props: SchemaProps,
},
RemovedAttr {
name: String,
},
}
pub fn diff(old: &Schema, new: &Schema) -> Vec<Diff> {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut differences = Vec::new();
let old = old.to_builder();
let new = new.to_builder();
// check if the old identifier differs from the new one
if old.identifier != new.identifier {
let old = old.identifier;
let new = new.identifier;
differences.push(IdentChange { old, new });
}
// compare all old attributes positions
// and properties with the new ones
for (pos, (name, props)) in old.attributes.iter().enumerate() {
match new.attributes.get_full(name) {
Some((npos, _, nprops)) => {
if pos != npos {
let name = name.clone();
differences.push(AttrMove {
name,
old: pos,
new: npos,
});
}
if props != nprops {
let name = name.clone();
differences.push(AttrPropsChange {
name,
old: *props,
new: *nprops,
});
}
}
None => differences.push(RemovedAttr { name: name.clone() }),
}
}
// retrieve all attributes that
// were not present in the old schema
for (pos, (name, props)) in new.attributes.iter().enumerate() {
if !old.attributes.contains_key(name) {
let name = name.clone();
differences.push(NewAttr {
name,
pos,
props: *props,
});
}
}
differences
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn difference() {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
builder.new_attribute("omega", INDEXED);
let old = builder.build();
let mut builder = SchemaBuilder::with_identifier("kiki");
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("alpha", DISPLAYED | INDEXED);
builder.new_attribute("delta", RANKED);
builder.new_attribute("gamma", DISPLAYED);
let new = builder.build();
let differences = diff(&old, &new);
let expected = &[
IdentChange {
old: format!("id"),
new: format!("kiki"),
},
AttrMove {
name: format!("alpha"),
old: 0,
new: 1,
},
AttrPropsChange {
name: format!("alpha"),
old: DISPLAYED,
new: DISPLAYED | INDEXED,
},
AttrMove {
name: format!("beta"),
old: 1,
new: 0,
},
AttrMove {
name: format!("gamma"),
old: 2,
new: 3,
},
AttrPropsChange {
name: format!("gamma"),
old: INDEXED,
new: DISPLAYED,
},
RemovedAttr {
name: format!("omega"),
},
NewAttr {
name: format!("delta"),
pos: 2,
props: RANKED,
},
];
assert_eq!(&differences, expected)
}
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
@ -303,4 +491,43 @@ mod tests {
Ok(())
}
#[test]
fn debug_output() {
use std::fmt::Write as _;
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut output = String::new();
let _ = write!(&mut output, "{:#?}", schema);
let expected = r#"Schema {
identifier: "id",
attributes: {
"alpha": {
DISPLAYED,
},
"beta": {
DISPLAYED,
INDEXED,
},
"gamma": {
INDEXED,
},
},
}"#;
assert_eq!(output, expected);
let mut output = String::new();
let _ = write!(&mut output, "{:?}", schema);
let expected = r#"Schema { identifier: "id", attributes: {"alpha": {DISPLAYED}, "beta": {DISPLAYED, INDEXED}, "gamma": {INDEXED}} }"#;
assert_eq!(output, expected);
}
}

View File

@ -1,8 +1,9 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
version = "0.6.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
deunicode = "1.0.0"
slice-group-by = "0.2.4"

View File

@ -1,4 +1,5 @@
use self::SeparatorCategory::*;
use deunicode::deunicode_char;
use slice_group_by::StrGroupBy;
use std::iter::Peekable;
@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
c if c.is_whitespace() => Some(Soft), // whitespaces
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
'-' | '_' | '\'' | ':' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None,
}
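The two deunicode_char arms are what make typographic quotes behave as soft separators: deunicode transliterates a codepoint to its closest ASCII form, so any character that maps to a plain quote matches. A quick standalone check (only deunicode assumed):

use deunicode::deunicode_char;

fn main() {
    // U+2019 (right single quotation mark) transliterates to an ASCII apostrophe...
    assert_eq!(deunicode_char('’'), Some("'"));
    // ...and U+201C (left double quotation mark) to a plain double quote.
    assert_eq!(deunicode_char('“'), Some("\""));
    // Ordinary letters map to themselves and are not separators.
    assert_eq!(deunicode_char('a'), Some("a"));
}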