Compare commits

...

87 Commits

Author SHA1 Message Date
026464b2e4 Bump meilidb-core to v0.6.5 2019-11-06 11:52:34 +01:00
bd42158a70 Merge pull request #264 from meilisearch/index-soft-deletion
Index soft deletion
2019-11-06 11:51:50 +01:00
df066f4321 Introduce a new add or update documents PUT route 2019-11-06 11:42:41 +01:00
69832e8c70 Update the http index deletion route 2019-11-06 11:42:41 +01:00
95eb6ad09a Add a test to check index soft deletion works correctly 2019-11-06 11:02:30 +01:00
f3fc0bed45 Introduce index soft deletion 2019-11-06 11:02:30 +01:00
5dd6b697b9 Bump meilidb-core to v0.6.4 2019-11-05 18:46:16 +01:00
b7d170c7d1 Merge pull request #262 from meilisearch/fix-unidecoded-emojis
Fix an highlighting problem
2019-11-05 17:04:35 +01:00
7541172d12 Make the example show highlighted areas more explicitly 2019-11-05 16:40:48 +01:00
85bf5d113c Fix an highlighting problem when query was longer than original text 2019-11-05 16:40:34 +01:00
89fd397903 Bump meilidb-core to v0.6.3 2019-11-05 15:40:04 +01:00
d8392f2f18 Merge pull request #261 from meilisearch/partial-updates
Introduce the support of partial updates
2019-11-05 15:39:02 +01:00
36b74f0efe Introduce partial updates to the update system 2019-11-05 15:23:41 +01:00
68c0a36b00 Make the deserialization support correctly optional documents 2019-11-05 15:03:18 +01:00
a127b72a74 Merge pull request #259 from meilisearch/allow-add-schema-attributes-at-end
Allow to introduce attributes only at the end of a schema
2019-11-05 12:34:11 +01:00
5782fb9e52 Test the add of attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
20319f7974 Allow to introduce attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
c4087e2ec2 Merge pull request #258 from meilisearch/debug-schema
Implement a better debug for the schema
2019-11-05 11:35:02 +01:00
b1d1f2f627 Implement a better debug system for the schema 2019-11-05 11:21:07 +01:00
62fe6a8263 Merge pull request #257 from meilisearch/bump-version
Bump meilidb-core/tokenizer versions
2019-11-04 17:26:01 +01:00
d88c10f3b4 Bump meilidb-tokenizer to v0.6.1 2019-11-04 17:17:06 +01:00
00f49990c7 Bump meilidb-core to v0.6.2 2019-11-04 17:16:50 +01:00
89f30ad47e Merge pull request #256 from meilisearch/fix-tokenizer
Fix the tokenizer to make it work with unicode chars
2019-11-04 17:15:17 +01:00
3b1cbed238 Check that the unidecoded words are not empty 2019-11-04 17:03:11 +01:00
4571b80a49 Update the tests 2019-11-04 16:41:58 +01:00
de2b8672d4 Make the tokenizer understand strange whitespaces/quotes 2019-11-04 16:41:58 +01:00
ccded7b429 Improve the indexer to not not deunicode before indexing
Revert of #179
2019-11-04 16:41:58 +01:00
1d4e98410a Merge pull request #255 from meilisearch/bump-version
Bump meilidb-core to v0.6.1
2019-11-04 14:47:53 +01:00
e493b27ef1 Bump meilidb-core to v0.6.1 2019-11-04 14:22:08 +01:00
70589c136f Merge pull request #253 from meilisearch/fix-updates-system
Fix the updates system
2019-11-04 13:46:37 +01:00
1c3620a7d4 Add tests to the update system 2019-11-04 13:18:07 +01:00
c2cc0704d7 Clean up the update_awaiter function 2019-11-04 11:11:58 +01:00
2a50e08bb8 Moving to heed v0.5.0 2019-11-04 10:49:27 +01:00
6b326a45d7 Fix the update system to always consume updates even if failing 2019-10-31 17:44:13 +01:00
b73874bf24 Merge pull request #252 from meilisearch/examples-specify-index-name
Allow users to specify the index name to use with examples bins
2019-10-31 17:02:00 +01:00
95c8ad0f80 Allow users to specify the index name to use with examples bins 2019-10-31 16:20:31 +01:00
996763cc52 Merge pull request #251 from meilisearch/update-heed
Moving to heed 0.3.0
2019-10-31 16:20:07 +01:00
6a8171d335 Moving to heed 0.3.0 2019-10-31 16:11:02 +01:00
2f32586dab Merge pull request #250 from meilisearch/new-http-server
Introduce a brand new HTTP server
2019-10-31 16:07:52 +01:00
db898001eb Get rid of rust-crypto and uuid 2019-10-31 15:28:37 +01:00
c2a12b661a Make it a runnable server 2019-10-31 15:27:21 +01:00
f51c49db93 Introduce the HTTP tide based library 2019-10-31 15:02:34 +01:00
1be5b0f327 Bump the meili-core/schema/tokenizer crates to 0.6.0 2019-10-31 14:05:59 +01:00
a136c62208 Merge pull request #249 from meilisearch/display-all-updates
Display enqueued along with processed updates
2019-10-31 13:53:46 +01:00
cc461b1331 Display enqueued along with processed updates 2019-10-31 12:25:52 +01:00
dbe5363672 Merge pull request #248 from meilisearch/fix-highlight-too-long
Correctly highlight when query string is too long
2019-10-30 18:19:06 +01:00
45d4361e7d Correctly highlight when query string is longer 2019-10-30 17:49:50 +01:00
b28c44cc6b Merge pull request #247 from meilisearch/bump-meilidb
Bump the meili-core/schema/tokenizer crates to 0.5.11
2019-10-30 17:48:26 +01:00
b709a7a30a Bump the meili-core/schema/tokenizer crates to 0.5.11 2019-10-30 17:40:31 +01:00
64c25bdb40 Merge pull request #246 from meilisearch/better-highlighting-area
Make the highlight system much better
2019-10-30 17:39:12 +01:00
c230f244be Make the highlight system much better 2019-10-30 17:32:29 +01:00
02af4ff113 Merge pull request #245 from meilisearch/reindex-all-documents-reduce-memory-usage
Reduce the ram consumption when re-indexing all the documents
2019-10-29 17:54:47 +01:00
4dff8a215e Reduce the ram consumption when re-indexing all the documents 2019-10-29 17:46:23 +01:00
41065305aa Merge pull request #244 from meilisearch/reintroduce-stop-words
Reintroduce stop words
2019-10-29 16:35:03 +01:00
e9dce3ce81 Add a test to ensure that the indexer support stop words 2019-10-29 16:18:06 +01:00
ff7dde7522 Make the RawIndexer support stop words 2019-10-29 16:18:06 +01:00
a226fd23c3 Introduce the stop words deletion update type 2019-10-29 16:18:06 +01:00
776673ebae Introduce the stop words addition update type 2019-10-29 15:24:09 +01:00
32d2cc3aea Merge pull request #243 from meilisearch/all-updates-results
Introduce a function to get all updates results
2019-10-29 11:45:55 +01:00
8a17fcdda5 Introduce a function to get all updates results 2019-10-29 11:37:40 +01:00
9602d7a960 Merge pull request #242 from meilisearch/accept-dup-documents
Make documents additions accept only the last duplicate document
2019-10-28 20:52:40 +01:00
ac12a4b9c9 Make documents additions accept only the last duplicate document 2019-10-28 20:40:33 +01:00
af96050944 Merge pull request #241 from meilisearch/fix-dead-locks
Fix dead locks
2019-10-28 18:20:01 +01:00
a43b37dfc1 Send channel notification when clearing documents 2019-10-28 17:58:22 +01:00
c08dcac1d4 Abort the update transaction before calling the update callback 2019-10-28 17:55:43 +01:00
a17dccd84e Merge pull request #237 from meilisearch/fix-exactness-criterion
Fix the exactness criterion algorithm
2019-10-26 18:43:10 +02:00
9a57cab3ee Fix the exactness criterion algorithm 2019-10-26 18:34:40 +02:00
751b060320 Merge pull request #238 from meilisearch/improve-highlighting
Only highlight query words areas not the whole words
2019-10-26 18:23:19 +02:00
4111b99a6d Only highlight query words areas not the whole words 2019-10-26 15:56:34 +02:00
d6fb2b56d1 Merge pull request #236 from meilisearch/reorder-automatons
Make sure that automatons group with more automatons are better
2019-10-24 15:29:16 +02:00
cb5c77e536 Make sure that automatons group with more automatons are better 2019-10-24 15:18:53 +02:00
44c89b1ea2 Merge pull request #235 from meilisearch/readme-concat-split-query-words
Add information about search concat and split query words support
2019-10-23 18:20:59 +02:00
26a285053b Add information about search concat and split query words support 2019-10-23 18:19:15 +02:00
1446a6a2d2 Merge pull request #234 from meilisearch/clear-all-update-variant
Introduce a clear all documents update
2019-10-23 16:45:37 +02:00
047eba3ff3 Introduce a clear all documents update 2019-10-23 16:39:10 +02:00
8d9d183ce6 Merge pull request #233 from meilisearch/commit-when-update-ok
Commit an update only when it is Ok
2019-10-23 16:07:48 +02:00
eb67195840 Commit an update only when it is Ok 2019-10-23 15:52:40 +02:00
93306c2326 Merge pull request #232 from meilisearch/support-splitted-words
Support splitted words
2019-10-23 13:38:16 +02:00
7d9cf8d713 Clean up the fetch algorithm 2019-10-23 12:06:21 +02:00
03eb7898e7 Introduce a basic working version of phrase query for splitting words 2019-10-23 11:40:13 +02:00
0fbd4cd632 Merge pull request #231 from meilisearch/recursive-object-indexing
Make possible to convert recursive object into strings
2019-10-22 16:20:10 +02:00
858bf359b8 Make possible to convert recursive object into strings 2019-10-22 16:02:02 +02:00
5dc8465ebd Merge pull request #181 from meilisearch/diff-schema
Make possible to update an index schema
2019-10-22 14:23:43 +02:00
0f30a221fa Introduce the reindex_all_documents indexing function 2019-10-22 14:07:27 +02:00
e86a547e93 Introduce a basic schema diff function 2019-10-21 17:57:32 +02:00
32d8b4b83f Merge pull request #230 from meilisearch/moving-to-heed
Move to heed 0.1.0
2019-10-21 13:34:06 +02:00
78535b3e33 Move to heed 0.1.0 2019-10-21 12:05:53 +02:00
64 changed files with 5811 additions and 593 deletions

View File

@@ -1,6 +1,7 @@
 [workspace]
 members = [
     "meilidb-core",
+    "meilidb-http",
     "meilidb-schema",
     "meilidb-tokenizer",
 ]

View File

@@ -12,6 +12,7 @@ A _full-text search database_ based on the fast [LMDB key-value store](https://e
 - Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
 - Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
 - Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
+- Searches for [concatenated](https://github.com/meilisearch/MeiliDB/pull/164) and [splitted query words](https://github.com/meilisearch/MeiliDB/pull/232) to improve the search quality.
 - Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
 - The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages
 - Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results
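
For context, here is a minimal, hypothetical sketch of how the features above surface through the meilidb-core API used elsewhere in this changeset (a ranged query returning documents with their matching text areas). The database path, index name, and query string are illustrative, the `highlights` field name is assumed, and it presumes an index that already has a schema and indexed documents:

use meilidb_core::Database;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let database = Database::open_or_create("test.mdb")?;
    let index = database.open_index("default").expect("index not found");

    // read transaction on the LMDB environment, as the examples in this changeset do
    let reader = database.env.read_txn().unwrap();

    // the 0..20 range is the "ranged query" part, useful for pagination
    let documents = index.query_builder().query(&reader, "search engine", 0..20)?;

    for document in documents {
        // each returned document carries the matching text areas (Highlight values),
        // which a caller can use to highlight the matched words in results
        println!("{:?} -> {:?}", document.id, document.highlights);
    }

    Ok(())
}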

View File

@@ -1,6 +1,6 @@
 [package]
 name = "meilidb-core"
-version = "0.1.0"
+version = "0.6.5"
 authors = ["Kerollmops <clement@meilisearch.com>"]
 edition = "2018"
@@ -12,9 +12,10 @@ crossbeam-channel = "0.3.9"
 deunicode = "1.0.0"
 env_logger = "0.7.0"
 hashbrown = { version = "0.6.0", features = ["serde"] }
+heed = "0.5.0"
 log = "0.4.8"
-meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
-meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
+meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.6.0" }
 once_cell = "1.2.0"
 ordered-float = { version = "1.0.2", features = ["serde"] }
 sdset = "0.3.3"
@@ -24,11 +25,6 @@ siphasher = "0.3.0"
 slice-group-by = "0.2.6"
 zerocopy = "0.2.8"
 
-[dependencies.zlmdb]
-package = "zerocopy-lmdb"
-git = "https://github.com/Kerollmops/zerocopy-lmdb.git"
-branch = "master"
-
 [dependencies.levenshtein_automata]
 git = "https://github.com/Kerollmops/levenshtein-automata.git"
 branch = "arc-byte-slice"

View File

@@ -12,17 +12,18 @@ use serde::{Deserialize, Serialize};
 use structopt::StructOpt;
 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 
-use meilidb_core::{Database, Highlight, UpdateResult};
+use meilidb_core::{Database, Highlight, ProcessedUpdateResult};
 use meilidb_schema::SchemaAttr;
 
-const INDEX_NAME: &str = "default";
-
 #[derive(Debug, StructOpt)]
 struct IndexCommand {
     /// The destination where the database must be created.
     #[structopt(parse(from_os_str))]
     database_path: PathBuf,
 
+    #[structopt(long, default_value = "default")]
+    index_name: String,
+
     /// The csv file to index.
     #[structopt(parse(from_os_str))]
     csv_data_path: PathBuf,
@@ -40,10 +41,13 @@ struct IndexCommand {
 #[derive(Debug, StructOpt)]
 struct SearchCommand {
-    /// The destination where the database must be created.
+    /// The path of the database to work with.
     #[structopt(parse(from_os_str))]
     database_path: PathBuf,
 
+    #[structopt(long, default_value = "default")]
+    index_name: String,
+
     /// Timeout after which the search will return results.
     #[structopt(long)]
     fetch_timeout_ms: Option<u64>,
@@ -65,10 +69,21 @@ struct SearchCommand {
     displayed_fields: Vec<String>,
 }
 
+#[derive(Debug, StructOpt)]
+struct ShowUpdatesCommand {
+    /// The path of the database to work with.
+    #[structopt(parse(from_os_str))]
+    database_path: PathBuf,
+
+    #[structopt(long, default_value = "default")]
+    index_name: String,
+}
+
 #[derive(Debug, StructOpt)]
 enum Command {
     Index(IndexCommand),
     Search(SearchCommand),
+    ShowUpdates(ShowUpdatesCommand),
 }
 
 impl Command {
@@ -76,6 +91,7 @@ impl Command {
         match self {
             Command::Index(command) => &command.database_path,
             Command::Search(command) => &command.database_path,
+            Command::ShowUpdates(command) => &command.database_path,
         }
     }
 }
@@ -88,13 +104,13 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
     let start = Instant::now();
 
     let (sender, receiver) = mpsc::sync_channel(100);
-    let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
+    let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
-    let index = match database.open_index(INDEX_NAME) {
+    let index = match database.open_index(&command.index_name) {
         Some(index) => index,
-        None => database.create_index(INDEX_NAME).unwrap(),
+        None => database.create_index(&command.index_name).unwrap(),
     };
 
-    let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
+    let done = database.set_update_callback(&command.index_name, Box::new(update_fn));
     assert!(done, "could not set the index update function");
 
     let env = &database.env;
@@ -201,7 +217,11 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
             _ => unreachable!(),
         };
         if highlighted {
-            stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
+            stdout.set_color(
+                ColorSpec::new()
+                    .set_fg(Some(Color::Yellow))
+                    .set_underline(true),
+            )?;
         }
         write!(&mut stdout, "{}", &text[start..end])?;
         stdout.reset()?;
@@ -297,12 +317,13 @@ fn crop_text(
 fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
     let env = &database.env;
     let index = database
-        .open_index(INDEX_NAME)
+        .open_index(&command.index_name)
         .expect("Could not find index");
     let reader = env.read_txn().unwrap();
     let schema = index.main.schema(&reader)?;
     reader.abort();
+
     let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
 
     let fields = command.displayed_fields.iter().map(String::as_str);
@@ -418,6 +439,23 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
     Ok(())
 }
 
+fn show_updates_command(
+    command: ShowUpdatesCommand,
+    database: Database,
+) -> Result<(), Box<dyn Error>> {
+    let env = &database.env;
+    let index = database
+        .open_index(&command.index_name)
+        .expect("Could not find index");
+
+    let reader = env.read_txn().unwrap();
+    let updates = index.all_updates_status(&reader)?;
+
+    println!("{:#?}", updates);
+    reader.abort();
+
+    Ok(())
+}
+
 fn main() -> Result<(), Box<dyn Error>> {
     env_logger::init();
@@ -427,5 +465,6 @@ fn main() -> Result<(), Box<dyn Error>> {
     match opt {
         Command::Index(command) => index_command(command, database),
         Command::Search(command) => search_command(command, database),
+        Command::ShowUpdates(command) => show_updates_command(command, database),
     }
 }

View File

@ -2,7 +2,7 @@ mod dfa;
mod query_enhancer; mod query_enhancer;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::vec; use std::{cmp, vec};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA; use levenshtein_automata::DFA;
@ -18,27 +18,55 @@ use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3; const NGRAMS: usize = 3;
pub struct AutomatonProducer { pub struct AutomatonProducer {
automatons: Vec<Vec<Automaton>>, automatons: Vec<AutomatonGroup>,
} }
impl AutomatonProducer { impl AutomatonProducer {
pub fn new( pub fn new(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
main_store: store::Main, main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> { ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) = let (automatons, query_enhancer) = generate_automatons(
generate_automatons(reader, query, main_store, synonyms_store)?; reader,
query,
main_store,
postings_list_store,
synonyms_store,
)?;
Ok((AutomatonProducer { automatons }, query_enhancer)) Ok((AutomatonProducer { automatons }, query_enhancer))
} }
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> { pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter() self.automatons.into_iter()
} }
} }
#[derive(Debug)]
pub struct AutomatonGroup {
pub is_phrase_query: bool,
pub automatons: Vec<Automaton>,
}
impl AutomatonGroup {
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: false,
automatons,
}
}
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: true,
automatons,
}
}
}
#[derive(Debug)] #[derive(Debug)]
pub struct Automaton { pub struct Automaton {
pub index: usize, pub index: usize,
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
string string
} }
fn split_best_frequency<'a>(
reader: &heed::RoTxn,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn generate_automatons( fn generate_automatons(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
main_store: store::Main, main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms, synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> { ) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? { let synonyms = match main_store.synonyms_fst(reader)? {
@ -136,7 +193,7 @@ fn generate_automatons(
original_automatons.push(automaton); original_automatons.push(automaton);
} }
automatons.push(original_automatons); automatons.push(AutomatonGroup::normal(original_automatons));
for n in 1..=NGRAMS { for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable(); let mut ngrams = query_words.windows(n).enumerate().peekable();
@ -188,13 +245,27 @@ fn generate_automatons(
Automaton::non_exact(automaton_index, n, synonym) Automaton::non_exact(automaton_index, n, synonym)
}; };
automaton_index += 1; automaton_index += 1;
automatons.push(vec![automaton]); automatons.push(AutomatonGroup::normal(vec![automaton]));
} }
} }
} }
} }
if n != 1 { if n == 1 {
if let Some((left, right)) =
split_best_frequency(reader, &normalized, postings_lists_store)?
{
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
}
} else {
// automaton of concatenation of query words // automaton of concatenation of query words
let concat = ngram_slice.concat(); let concat = ngram_slice.concat();
let normalized = normalize_str(&concat); let normalized = normalize_str(&concat);
@ -204,16 +275,20 @@ fn generate_automatons(
let automaton = Automaton::exact(automaton_index, n, &normalized); let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1; automaton_index += 1;
automatons.push(vec![automaton]); automatons.push(AutomatonGroup::normal(vec![automaton]));
} }
} }
} }
// order automatons, the most important first, // order automatons, the most important first,
// we keep the original automatons at the front. // we keep the original automatons at the front.
automatons[1..].sort_by_key(|a| { automatons[1..].sort_by_key(|group| {
let a = a.first().unwrap(); let a = group.automatons.first().unwrap();
(Reverse(a.is_exact), a.ngram) (
Reverse(a.is_exact),
a.ngram,
Reverse(group.automatons.len()),
)
}); });
Ok((automatons, enhancer_builder.build())) Ok((automatons, enhancer_builder.build()))
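
The new split_best_frequency helper above drives the split-query-word behaviour referenced in the README and in PR #232: for a single query word it tries every split point and keeps the split whose less frequent half is the most frequent one, then wraps the two halves in a phrase-query automaton group so they only count when they appear consecutively. Below is a self-contained sketch of that selection logic, with a plain HashMap standing in for the LMDB postings-lists store and purely illustrative frequencies:

use std::collections::HashMap;

fn split_best_frequency<'a>(word: &'a str, freqs: &HashMap<&str, usize>) -> Option<(&'a str, &'a str)> {
    let mut best = None;
    // try every split point between two characters
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let left_freq = freqs.get(left).copied().unwrap_or(0);
        let right_freq = freqs.get(right).copied().unwrap_or(0);
        // keep the split whose rarer half is the most frequent one seen so far
        let min_freq = left_freq.min(right_freq);
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
            best = Some((min_freq, left, right));
        }
    }
    best.map(|(_, l, r)| (l, r))
}

fn main() {
    let mut freqs = HashMap::new();
    freqs.insert("search", 10);
    freqs.insert("engine", 8);
    freqs.insert("sea", 2);
    assert_eq!(split_best_frequency("searchengine", &freqs), Some(("search", "engine")));
}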

View File

@@ -21,16 +21,15 @@ fn number_exact_matches(
     let len = group.len();
 
     let mut found_exact = false;
-    for (pos, _) in is_exact[index..index + len]
-        .iter()
-        .filter(|x| **x)
-        .enumerate()
-    {
-        found_exact = true;
-        if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
-            let (_, count) = fields_counts[pos];
-            if count == 1 {
-                return usize::max_value();
+    for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
+        if *is_exact {
+            found_exact = true;
+            let attr = &attribute[index + pos];
+            if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
+                let (_, count) = fields_counts[pos];
+                if count == 1 {
+                    return usize::max_value();
+                }
             }
         }
     }
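
The bug fixed here is easy to miss: the old code filtered before enumerating, so `pos` numbered only the exact matches and no longer lined up with the parallel `attribute` slice (and the `index` offset was dropped entirely). A small standalone illustration, with made-up values, of why the enumeration has to happen before the filter:

fn main() {
    let is_exact = [false, true, false, true];

    // filter-then-enumerate: positions are 0 and 1, wrong for indexing a parallel slice
    let filtered: Vec<usize> = is_exact.iter().filter(|x| **x).enumerate().map(|(pos, _)| pos).collect();
    assert_eq!(filtered, vec![0, 1]);

    // enumerate-then-test: positions are 1 and 3, the true offsets within `is_exact`
    let enumerated: Vec<usize> = is_exact
        .iter()
        .enumerate()
        .filter(|(_, x)| **x)
        .map(|(pos, _)| pos)
        .collect();
    assert_eq!(enumerated, vec![1, 3]);
}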

View File

@ -4,75 +4,104 @@ use std::path::Path;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::{fs, thread}; use std::{fs, thread};
use crossbeam_channel::Receiver; use crossbeam_channel::{Receiver, Sender};
use log::{debug, error}; use heed::types::{Str, Unit};
use zlmdb::types::{Str, Unit}; use heed::{CompactionOption, Result as ZResult};
use zlmdb::{CompactionOption, Result as ZResult}; use log::debug;
use crate::{store, update, Index, MResult}; use crate::{store, update, Index, MResult};
pub type BoxUpdateFn = Box<dyn Fn(update::UpdateResult) + Send + Sync + 'static>; pub type BoxUpdateFn = Box<dyn Fn(update::ProcessedUpdateResult) + Send + Sync + 'static>;
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>; type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
pub struct Database { pub struct Database {
pub env: zlmdb::Env, pub env: heed::Env,
common_store: zlmdb::DynDatabase, common_store: heed::PolyDatabase,
indexes_store: zlmdb::Database<Str, Unit>, indexes_store: heed::Database<Str, Unit>,
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>, indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
} }
fn update_awaiter( macro_rules! r#break_try {
receiver: Receiver<()>, ($expr:expr, $msg:tt) => {
env: zlmdb::Env, match $expr {
update_fn: Arc<ArcSwapFn>, core::result::Result::Ok(val) => val,
index: Index, core::result::Result::Err(err) => {
) { log::error!(concat!($msg, ": {}"), err);
for () in receiver { break;
// consume all updates in order (oldest first) }
}
};
}
pub enum UpdateEvent {
NewUpdate,
MustStop,
}
pub type UpdateEvents = Receiver<UpdateEvent>;
pub type UpdateEventsEmitter = Sender<UpdateEvent>;
fn update_awaiter(receiver: UpdateEvents, env: heed::Env, update_fn: Arc<ArcSwapFn>, index: Index) {
let mut receiver = receiver.into_iter();
while let Some(UpdateEvent::NewUpdate) = receiver.next() {
loop { loop {
let mut writer = match env.write_txn() { // instantiate a main/parent transaction
Ok(writer) => writer, let mut writer = break_try!(env.write_txn(), "LMDB write transaction begin failed");
Err(e) => {
error!("LMDB writer transaction begin failed: {}", e);
break;
}
};
match update::update_task(&mut writer, index.clone()) { // retrieve the update that needs to be processed
Ok(Some(status)) => { let result = index.updates.pop_front(&mut writer);
if let Err(e) = writer.commit() { let (update_id, update) = match break_try!(result, "pop front update failed") {
error!("update transaction failed: {}", e) Some(value) => value,
} None => {
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
}
}
// no more updates to handle for now
Ok(None) => {
debug!("no more updates"); debug!("no more updates");
writer.abort(); writer.abort();
break; break;
} }
Err(e) => { };
error!("update task failed: {}", e);
writer.abort() // instantiate a nested transaction
} let result = env.nested_write_txn(&mut writer);
let mut nested_writer = break_try!(result, "LMDB nested write transaction failed");
// try to apply the update to the database using the nested transaction
let result = update::update_task(&mut nested_writer, index.clone(), update_id, update);
let status = break_try!(result, "update task failed");
// commit the nested transaction if the update was successful, abort it otherwise
if status.result.is_ok() {
break_try!(nested_writer.commit(), "commit nested transaction failed");
} else {
nested_writer.abort()
}
// write the result of the update in the updates-results store
let updates_results = index.updates_results;
let result = updates_results.put_update_result(&mut writer, update_id, &status);
// always commit the main/parent transaction, even if the update was unsuccessful
break_try!(result, "update result store commit failed");
break_try!(writer.commit(), "update parent transaction failed");
// call the user callback when the update and the result are written consistently
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
} }
} }
} }
debug!("update loop system stopped");
} }
impl Database { impl Database {
pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> { pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> {
fs::create_dir_all(path.as_ref())?; fs::create_dir_all(path.as_ref())?;
let env = zlmdb::EnvOpenOptions::new() let env = heed::EnvOpenOptions::new()
.map_size(10 * 1024 * 1024 * 1024) // 10GB .map_size(10 * 1024 * 1024 * 1024) // 10GB
.max_dbs(3000) .max_dbs(3000)
.open(path)?; .open(path)?;
let common_store = env.create_dyn_database(Some("common"))?; let common_store = env.create_poly_database(Some("common"))?;
let indexes_store = env.create_database::<Str, Unit>(Some("indexes"))?; let indexes_store = env.create_database::<Str, Unit>(Some("indexes"))?;
// list all indexes that needs to be opened // list all indexes that needs to be opened
@ -111,7 +140,7 @@ impl Database {
// send an update notification to make sure that // send an update notification to make sure that
// possible pre-boot updates are consumed // possible pre-boot updates are consumed
sender.send(()).unwrap(); sender.send(UpdateEvent::NewUpdate).unwrap();
let result = indexes.insert(index_name, (index, update_fn, handle)); let result = indexes.insert(index_name, (index, update_fn, handle));
assert!( assert!(
@ -167,6 +196,28 @@ impl Database {
} }
} }
pub fn delete_index(&self, name: impl AsRef<str>) -> MResult<bool> {
let name = name.as_ref();
let mut indexes_lock = self.indexes.write().unwrap();
match indexes_lock.remove_entry(name) {
Some((name, (index, _fn, handle))) => {
// remove the index name from the list of indexes
// and clear all the LMDB dbi
let mut writer = self.env.write_txn()?;
self.indexes_store.delete(&mut writer, &name)?;
store::clear(&mut writer, &index)?;
writer.commit()?;
// join the update loop thread to ensure it is stopped
handle.join().unwrap();
Ok(true)
}
None => Ok(false),
}
}
pub fn set_update_callback(&self, name: impl AsRef<str>, update_fn: BoxUpdateFn) -> bool { pub fn set_update_callback(&self, name: impl AsRef<str>, update_fn: BoxUpdateFn) -> bool {
let indexes_lock = self.indexes.read().unwrap(); let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) { match indexes_lock.get(name.as_ref()) {
@ -199,7 +250,533 @@ impl Database {
Ok(indexes.keys().cloned().collect()) Ok(indexes.keys().cloned().collect())
} }
pub fn common_store(&self) -> zlmdb::DynDatabase { pub fn common_store(&self) -> heed::PolyDatabase {
self.common_store self.common_store
} }
} }
#[cfg(test)]
mod tests {
use super::*;
use crate::update::{ProcessedUpdateResult, UpdateStatus};
use crate::DocumentId;
use serde::de::IgnoredAny;
use std::sync::mpsc;
#[test]
fn valid_updates() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
}
#[test]
fn invalid_updates() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
}
#[test]
fn add_schema_attributes_at_end() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let _update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
[attributes."age"]
displayed = true
indexed = true
[attributes."sex"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
reader.abort();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
"age": 21,
"sex": "Male",
});
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
"age": 23,
"sex": "Male",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
// even try to search for a document
let results = index.query_builder().query(&reader, "21 ", 0..20).unwrap();
assert_matches!(results.len(), 1);
reader.abort();
// try to introduce attributes in the middle of the schema
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
[attributes."city"]
displayed = true
indexed = true
[attributes."age"]
displayed = true
indexed = true
[attributes."sex"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
}
#[test]
fn deserialize_documents() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
// DocumentId(7900334843754999545)
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
// DocumentId(8367468610878465872)
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
assert!(document.is_some());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
assert!(document.is_some());
}
#[test]
fn partial_document_update() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
let schema = {
let data = r#"
identifier = "id"
[attributes."id"]
displayed = true
[attributes."name"]
displayed = true
indexed = true
[attributes."description"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
// DocumentId(7900334843754999545)
let doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "My name is Marvin",
});
// DocumentId(8367468610878465872)
let doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "My name is Kevin",
});
additions.update_document(doc1);
additions.update_document(doc2);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
assert!(document.is_some());
let document: Option<IgnoredAny> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
assert!(document.is_some());
reader.abort();
let mut partial_additions = index.documents_partial_addition();
// DocumentId(7900334843754999545)
let partial_doc1 = serde_json::json!({
"id": 123,
"description": "I am the new Marvin",
});
// DocumentId(8367468610878465872)
let partial_doc2 = serde_json::json!({
"id": 234,
"description": "I am the new Kevin",
});
partial_additions.update_document(partial_doc1);
partial_additions.update_document(partial_doc2);
let mut writer = env.write_txn().unwrap();
let update_id = partial_additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
let document: Option<serde_json::Value> = index
.document(&reader, None, DocumentId(7900334843754999545))
.unwrap();
let new_doc1 = serde_json::json!({
"id": 123,
"name": "Marvin",
"description": "I am the new Marvin",
});
assert_eq!(document, Some(new_doc1));
let document: Option<serde_json::Value> = index
.document(&reader, None, DocumentId(8367468610878465872))
.unwrap();
let new_doc2 = serde_json::json!({
"id": 234,
"name": "Kevin",
"description": "I am the new Kevin",
});
assert_eq!(document, Some(new_doc2));
}
#[test]
fn delete_index() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let _index = database.create_index("test").unwrap();
let deleted = database.delete_index("test").unwrap();
assert!(deleted);
let result = database.open_index("test");
assert!(result.is_none());
}
}

View File

@@ -12,7 +12,7 @@ pub enum Error {
     SchemaMissing,
     WordIndexMissing,
     MissingDocumentId,
-    Zlmdb(zlmdb::Error),
+    Zlmdb(heed::Error),
     Fst(fst::Error),
     SerdeJson(SerdeJsonError),
     Bincode(bincode::Error),
@@ -27,8 +27,8 @@ impl From<io::Error> for Error {
     }
 }
 
-impl From<zlmdb::Error> for Error {
-    fn from(error: zlmdb::Error) -> Error {
+impl From<heed::Error> for Error {
+    fn from(error: heed::Error) -> Error {
         Error::Zlmdb(error)
     }
 }
@@ -79,7 +79,7 @@ impl fmt::Display for Error {
             SchemaMissing => write!(f, "this index does not have a schema"),
             WordIndexMissing => write!(f, "this index does not have a word index"),
             MissingDocumentId => write!(f, "document id is missing"),
-            Zlmdb(e) => write!(f, "zlmdb error; {}", e),
+            Zlmdb(e) => write!(f, "heed error; {}", e),
             Fst(e) => write!(f, "fst error; {}", e),
             SerdeJson(e) => write!(f, "serde json error; {}", e),
             Bincode(e) => write!(f, "bincode error; {}", e),
@@ -95,6 +95,10 @@ impl error::Error for Error {}
 #[derive(Debug)]
 pub enum UnsupportedOperation {
     SchemaAlreadyExists,
+    CannotUpdateSchemaIdentifier,
+    CannotReorderSchemaAttribute,
+    CanOnlyIntroduceNewSchemaAttributesAtEnd,
+    CannotRemoveSchemaAttribute,
 }
 
 impl fmt::Display for UnsupportedOperation {
@@ -102,6 +106,12 @@ impl fmt::Display for UnsupportedOperation {
         use self::UnsupportedOperation::*;
         match self {
             SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
+            CannotUpdateSchemaIdentifier => write!(f, "Cannot update the identifier of a schema"),
+            CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
+            CanOnlyIntroduceNewSchemaAttributesAtEnd => {
+                write!(f, "Can only introduce new attributes at end of a schema")
+            }
+            CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
         }
     }
 }

View File

@ -0,0 +1,134 @@
use std::cmp::min;
use std::collections::BTreeMap;
use std::ops::{Index, IndexMut};
// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len());
assert!(
n <= m,
"the source string must be shorter than the target one"
);
if n == 0 {
return (m as u32, 0);
}
if m == 0 {
return (n as u32, 0);
}
if n == m && source == target {
return (0, m);
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..n + 1 {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..m + 1 {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.iter().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.iter().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in n..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x)
}
}
minimum
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matched_length() {
let query = "Levenste";
let text = "Levenshtein";
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
assert_eq!(dist, 1);
assert_eq!(&text[..length], "Levenshte");
}
#[test]
#[should_panic]
fn matched_length_panic() {
let query = "Levenshtein";
let text = "Levenste";
// this function will panic if source if longer than target
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
}
}
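
A note on how this function is consumed: the second value it returns, the length of the target prefix that the source actually covers, feeds the highlighting change in query_builder.rs later in this diff, so only the covered part of an indexed word is highlighted rather than the whole word. A hypothetical helper sketching that use, assuming prefix_damerau_levenshtein from this file is in scope and char_length is the stored highlight length:

use std::cmp;
use std::convert::TryFrom;

fn covered_area(query: &str, input: &str, char_length: u16) -> u16 {
    // if the query word is longer than the indexed word, cover the whole word;
    // otherwise cover only the prefix matched by the distance computation above
    let covered = if query.len() > input.len() {
        input.len()
    } else {
        prefix_damerau_levenshtein(query.as_bytes(), input.as_bytes()).1
    };
    let covered = u16::try_from(covered).unwrap_or(u16::max_value());
    cmp::min(covered, char_length)
}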

View File

@@ -7,6 +7,7 @@ pub mod criterion;
 mod database;
 mod distinct_map;
 mod error;
+mod levenshtein;
 mod number;
 mod query_builder;
 mod ranked_map;
@@ -23,7 +24,7 @@ pub use self::number::{Number, ParseNumberError};
 pub use self::ranked_map::RankedMap;
 pub use self::raw_document::RawDocument;
 pub use self::store::Index;
-pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
+pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
 
 use ::serde::{Deserialize, Serialize};
 use zerocopy::{AsBytes, FromBytes};

View File

@ -1,15 +1,17 @@
use hashbrown::HashMap; use hashbrown::HashMap;
use std::mem; use std::convert::TryFrom;
use std::ops::Range; use std::ops::Range;
use std::rc::Rc; use std::rc::Rc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use std::{cmp, mem};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use sdset::SetBuf; use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut}; use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument}; use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
@ -137,8 +139,8 @@ fn multiword_rewrite_matches(
} }
fn fetch_raw_documents( fn fetch_raw_documents(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
automatons: &[Automaton], automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>, searchables: Option<&ReorderedAttrs>,
main_store: store::Main, main_store: store::Main,
@ -148,55 +150,104 @@ fn fetch_raw_documents(
let mut matches = Vec::new(); let mut matches = Vec::new();
let mut highlights = Vec::new(); let mut highlights = Vec::new();
for automaton in automatons { for group in automatons_groups {
let Automaton { let AutomatonGroup {
index, is_phrase_query,
is_exact, automatons,
query_len, } = group;
.. let phrase_query_len = automatons.len();
} = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? { let mut tmp_matches = Vec::new();
Some(words) => words, for (id, automaton) in automatons.into_iter().enumerate() {
None => return Ok(Vec::new()), let Automaton {
}; index,
is_exact,
query_len,
query,
..
} = automaton;
let dfa = automaton.dfa();
let mut stream = words.search(&dfa).into_stream(); let words = match main_store.words_fst(reader)? {
while let Some(input) = stream.next() { Some(words) => words,
let distance = dfa.eval(input).to_u8(); None => return Ok(Vec::new()),
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
}; };
matches.reserve(doc_indexes.len()); let mut stream = words.search(&dfa).into_stream();
highlights.reserve(doc_indexes.len()); while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
for di in doc_indexes.as_ref() { let covered_area = if *query_len > input.len() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); input.len()
if let Some(attribute) = attribute { } else {
let match_ = TmpMatch { prefix_damerau_levenshtein(query.as_bytes(), input).1
query_index: *index as u32, };
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let highlight = Highlight { let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
attribute: di.attribute, Some(doc_indexes) => doc_indexes,
char_index: di.char_index, None => continue,
char_length: di.char_length, };
};
matches.push((di.document_id, match_)); tmp_matches.reserve(doc_indexes.len());
highlights.push((di.document_id, highlight));
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
let covered_area = cmp::min(covered_area, di.char_length);
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: covered_area,
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
} }
} }
} }
if *is_phrase_query {
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// if matches must follow and actually follows themselves
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase query longer than 2
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
}
} else {
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
}
} }
let matches = multiword_rewrite_matches(matches, &query_enhancer); let matches = multiword_rewrite_matches(matches, &query_enhancer);
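The covered-area logic above is what fixes highlighting when the typed query is longer than the indexed word (for example "searchengine" matching the word "search"): the highlight length is capped by the matched word and by the char_length stored in the doc index. A minimal standalone restatement of that clamping, where levenshtein_covered stands for the .1 component returned by prefix_damerau_levenshtein in the real code:

    use std::cmp;
    use std::convert::TryFrom;

    // Sketch of the clamping performed for every document index above.
    fn clamp_highlight(query_len: usize, matched_len: usize, levenshtein_covered: usize, char_length: u16) -> u16 {
        // when the query is longer than the matched word, cover the word only
        let covered = if query_len > matched_len {
            matched_len
        } else {
            levenshtein_covered
        };
        // never exceed u16 nor the original length stored in the doc index
        let covered = u16::try_from(covered).unwrap_or(u16::max_value());
        cmp::min(covered, char_length)
    }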
@ -285,7 +336,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn query( pub fn query(
self, self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
range: Range<usize>, range: Range<usize>,
) -> MResult<Vec<Document>> { ) -> MResult<Vec<Document>> {
@ -323,7 +374,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
} }
fn raw_query<'c, FI>( fn raw_query<'c, FI>(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
range: Range<usize>, range: Range<usize>,
@ -367,15 +418,20 @@ where
let start_processing = Instant::now(); let start_processing = Instant::now();
let mut raw_documents_processed = Vec::with_capacity(range.len()); let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) = let (automaton_producer, query_enhancer) = AutomatonProducer::new(
AutomatonProducer::new(reader, query, main_store, synonyms_store)?; reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter(); let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new(); let mut automatons = Vec::new();
// aggregate automatons groups by groups after time // aggregate automatons groups by groups after time
for auts in automaton_producer { for auts in automaton_producer {
automatons.extend(auts); automatons.push(auts);
// we must retrieve the documents associated // we must retrieve the documents associated
// with the current automatons // with the current automatons
@ -454,7 +510,7 @@ where
} }
fn raw_query_with_distinct<'c, FI, FD>( fn raw_query_with_distinct<'c, FI, FD>(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
query: &str, query: &str,
range: Range<usize>, range: Range<usize>,
@ -480,15 +536,20 @@ where
let start_processing = Instant::now(); let start_processing = Instant::now();
let mut raw_documents_processed = Vec::new(); let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) = let (automaton_producer, query_enhancer) = AutomatonProducer::new(
AutomatonProducer::new(reader, query, main_store, synonyms_store)?; reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter(); let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new(); let mut automatons = Vec::new();
// aggregate automatons groups by groups after time // aggregate automatons groups by groups after time
for auts in automaton_producer { for auts in automaton_producer {
automatons.extend(auts); automatons.push(auts);
// we must retrieve the documents associated // we must retrieve the documents associated
// with the current automatons // with the current automatons
@ -1697,4 +1758,68 @@ mod tests {
}); });
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }
#[test]
fn simple_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("engine", &[doc_index(0, 1)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("engine", &[doc_index(1, 2)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
#[test]
fn harder_phrase_query_splitting() {
let store = TempDatabase::from_iter(vec![
("search", &[doc_index(0, 0)][..]),
("search", &[doc_index(0, 1)][..]),
("engine", &[doc_index(0, 2)][..]),
("search", &[doc_index(1, 0)][..]),
("slow", &[doc_index(1, 1)][..]),
("search", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
("search", &[doc_index(1, 0)][..]),
("search", &[doc_index(1, 1)][..]),
("slow", &[doc_index(1, 2)][..]),
("engine", &[doc_index(1, 3)][..]),
]);
let env = &store.database.env;
let reader = env.read_txn().unwrap();
let builder = store.query_builder();
let results = builder.query(&reader, "searchengine", 0..20).unwrap();
let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None);
}
} }
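The two tests above exercise the phrase-query path introduced in fetch_raw_documents: a concatenated query such as "searchengine" is split into an AutomatonGroup of ["search", "engine"], and matches are only kept when the words of the group sit on consecutive word positions of the same document. A standalone sketch of that adjacency filter, with matches simplified to (document_id, automaton_index, word_index) triples as in tmp_matches:

    // Keep only matches whose automaton index and word index both advance by
    // one between neighbours, and whose second member is the last word of the
    // phrase; this mirrors the windows(2) loop above for two-word phrases.
    fn adjacent_phrase_pairs(matches: &[(u64, usize, u32)], phrase_len: usize) -> Vec<(u64, u32)> {
        let mut kept = Vec::new();
        for window in matches.windows(2) {
            let (id_a, aut_a, word_a) = window[0];
            let (id_b, aut_b, word_b) = window[1];
            if id_a == id_b && aut_a + 1 == aut_b && word_a + 1 == word_b && aut_b + 1 == phrase_len {
                kept.push((id_a, word_a));
                kept.push((id_b, word_b));
            }
        }
        kept
    }

The real implementation then pushes the corresponding TmpMatch and Highlight pairs into matches and highlights, exactly as the non-phrase branch does.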

View File

@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer { pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>, words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>, docs_words: HashMap<DocumentId, Vec<Word>>,
} }
@ -21,13 +22,14 @@ pub struct Indexed {
} }
impl RawIndexer { impl RawIndexer {
pub fn new() -> RawIndexer { pub fn new(stop_words: fst::Set) -> RawIndexer {
RawIndexer::with_word_limit(1000) RawIndexer::with_word_limit(stop_words, 1000)
} }
pub fn with_word_limit(limit: usize) -> RawIndexer { pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
RawIndexer { RawIndexer {
word_limit: limit, word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(), words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(), docs_words: HashMap::new(),
} }
@ -35,89 +37,40 @@ impl RawIndexer {
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize { pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
let mut number_of_words = 0; let mut number_of_words = 0;
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check for token in Tokenizer::new(text) {
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
// we must not count 2 times the same words
number_of_words = 0;
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
number_of_words += 1;
}
}
number_of_words
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where
I: IntoIterator<Item = &'a str, IntoIter = IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token( let must_continue = index_token(
token, token,
id, id,
attr, attr,
self.word_limit, self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes, &mut self.words_doc_indexes,
&mut self.docs_words, &mut self.docs_words,
); );
number_of_words += 1;
if !must_continue { if !must_continue {
break; break;
} }
} }
let deunicoded: Vec<_> = lowercased number_of_words
.into_iter() }
.map(|lowercase_text| {
if lowercase_text.contains(is_cjk) {
return lowercase_text;
}
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded {
deunicoded
} else {
lowercase_text
}
})
.collect();
let iter = deunicoded.iter().map(|t| t.as_str());
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where
I: IntoIterator<Item = &'a str>,
{
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) { for token in SeqTokenizer::new(iter) {
let must_continue = index_token( let must_continue = index_token(
token, token,
id, id,
attr, attr,
self.word_limit, self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes, &mut self.words_doc_indexes,
&mut self.docs_words, &mut self.docs_words,
); );
@ -152,17 +105,12 @@ impl RawIndexer {
} }
} }
impl Default for RawIndexer {
fn default() -> Self {
Self::new()
}
}
fn index_token( fn index_token(
token: Token, token: Token,
id: DocumentId, id: DocumentId,
attr: SchemaAttr, attr: SchemaAttr,
word_limit: usize, word_limit: usize,
stop_words: &fst::Set,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>, words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>, docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool { ) -> bool {
@ -170,16 +118,36 @@ fn index_token(
return false; return false;
} }
match token_to_docindex(id, attr, token) { let lower = token.word.to_lowercase();
Some(docindex) => { let token = Token {
let word = Vec::from(token.word); word: &lower,
words_doc_indexes ..token
.entry(word.clone()) };
.or_insert_with(Vec::new)
.push(docindex); if !stop_words.contains(&token.word) {
docs_words.entry(id).or_insert_with(Vec::new).push(word); match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower && !unidecoded.is_empty() {
let word = Vec::from(unidecoded);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
}
}
None => return false,
} }
None => return false,
} }
true true
@ -207,7 +175,7 @@ mod tests {
#[test] #[test]
fn strange_apostrophe() { fn strange_apostrophe() {
let mut indexer = RawIndexer::new(); let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0); let docid = DocumentId(0);
let attr = SchemaAttr(0); let attr = SchemaAttr(0);
@ -222,16 +190,14 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes()) .get(&"éteindre".to_owned().into_bytes())
.is_some()); .is_some());
} }
#[test] #[test]
fn strange_apostrophe_in_sequence() { fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new(); let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0); let docid = DocumentId(0);
let attr = SchemaAttr(0); let attr = SchemaAttr(0);
@ -246,10 +212,53 @@ mod tests {
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes assert!(words_doc_indexes
.get(&"léteindre".to_owned().into_bytes()) .get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn basic_stop_words() {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn no_empty_unidecode() {
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "🇯🇵";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes
.get(&"🇯🇵".to_owned().into_bytes())
.is_some()); .is_some());
} }
} }
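Putting the reworked API together, a usage sketch of the indexer with a stop-word set, following the constructor and build() shapes shown above (the sample text and word list are illustrative):

    // Stop words must reach fst::Set in sorted order, hence SetBuf::from_dirty.
    let stop_words = sdset::SetBuf::from_dirty(vec!["de", "la", "le"]);
    let stop_words = fst::Set::from_iter(stop_words).unwrap();

    let mut indexer = RawIndexer::new(stop_words);
    indexer.index_text(DocumentId(0), SchemaAttr(0), "La petite maison de l'été");

    let Indexed { words_doc_indexes, .. } = indexer.build();
    // stop words are dropped, everything else is lowercased; non-CJK words are
    // additionally indexed under their deunicoded form (e.g. "ete" for "été")
    assert!(words_doc_indexes.get(&b"de"[..]).is_none());
    assert!(words_doc_indexes.get(&b"maison"[..]).is_some());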

View File

@ -12,8 +12,8 @@ impl ser::Serializer for ConvertToString {
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>; type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>; type SerializeMap = MapConvertToString;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>; type SerializeStruct = StructConvertToString;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> { fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -169,7 +169,9 @@ impl ser::Serializer for ConvertToString {
} }
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "map" }) Ok(MapConvertToString {
text: String::new(),
})
} }
fn serialize_struct( fn serialize_struct(
@ -177,8 +179,8 @@ impl ser::Serializer for ConvertToString {
_name: &'static str, _name: &'static str,
_len: usize, _len: usize,
) -> Result<Self::SerializeStruct, Self::Error> { ) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnserializableType { Ok(StructConvertToString {
type_name: "struct", text: String::new(),
}) })
} }
@ -194,3 +196,63 @@ impl ser::Serializer for ConvertToString {
}) })
} }
} }
pub struct MapConvertToString {
text: String,
}
impl ser::SerializeMap for MapConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.text.push_str(&text);
self.text.push_str(" ");
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.text.push_str(&text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
pub struct StructConvertToString {
text: String,
}
impl ser::SerializeStruct for StructConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let value = value.serialize(ConvertToString)?;
self.text.push_str(key);
self.text.push_str(" ");
self.text.push_str(&value);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
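ConvertToString can now flatten maps and structs instead of rejecting them, so nested document fields become one searchable string. A hedged sketch of the effect; note that, as written above, a space follows each key but not each value, so consecutive entries can run together without a separator:

    use serde::Serialize;

    #[derive(Serialize)]
    struct Address {
        city: String,
        zip: String,
    }

    let addr = Address { city: "Paris".into(), zip: "75000".into() };
    // flattens the field names and values into a single string containing
    // "city", "Paris", "zip" and "75000"
    let text = addr.serialize(ConvertToString).unwrap();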

View File

@ -14,7 +14,7 @@ use crate::DocumentId;
#[derive(Debug)] #[derive(Debug)]
pub enum DeserializerError { pub enum DeserializerError {
SerdeJson(SerdeJsonError), SerdeJson(SerdeJsonError),
Zlmdb(zlmdb::Error), Zlmdb(heed::Error),
Custom(String), Custom(String),
} }
@ -28,7 +28,7 @@ impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self { match self {
DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e), DeserializerError::SerdeJson(e) => write!(f, "serde json related error: {}", e),
DeserializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e), DeserializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s), DeserializerError::Custom(s) => f.write_str(s),
} }
} }
@ -42,15 +42,15 @@ impl From<SerdeJsonError> for DeserializerError {
} }
} }
impl From<zlmdb::Error> for DeserializerError { impl From<heed::Error> for DeserializerError {
fn from(error: zlmdb::Error) -> DeserializerError { fn from(error: heed::Error) -> DeserializerError {
DeserializerError::Zlmdb(error) DeserializerError::Zlmdb(error)
} }
} }
pub struct Deserializer<'a> { pub struct Deserializer<'a> {
pub document_id: DocumentId, pub document_id: DocumentId,
pub reader: &'a zlmdb::RoTxn, pub reader: &'a heed::RoTxn,
pub documents_fields: DocumentsFields, pub documents_fields: DocumentsFields,
pub schema: &'a Schema, pub schema: &'a Schema,
pub attributes: Option<&'a HashSet<SchemaAttr>>, pub attributes: Option<&'a HashSet<SchemaAttr>>,
@ -63,13 +63,14 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
where where
V: de::Visitor<'de>, V: de::Visitor<'de>,
{ {
self.deserialize_map(visitor) self.deserialize_option(visitor)
} }
forward_to_deserialize_any! { fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string where
bytes byte_buf option unit unit_struct newtype_struct seq tuple V: de::Visitor<'de>,
tuple_struct struct enum identifier ignored_any {
self.deserialize_map(visitor)
} }
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error> fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
@ -104,16 +105,29 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
} }
}); });
let map_deserializer = de::value::MapDeserializer::new(iter); let mut iter = iter.peekable();
let result = visitor
.visit_map(map_deserializer) let result = match iter.peek() {
.map_err(DeserializerError::from); Some(_) => {
let map_deserializer = de::value::MapDeserializer::new(iter);
visitor
.visit_some(map_deserializer)
.map_err(DeserializerError::from)
}
None => visitor.visit_none(),
};
match error.take() { match error.take() {
Some(error) => Err(error.into()), Some(error) => Err(error.into()),
None => result, None => result,
} }
} }
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
} }
struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>); struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>);

View File

@ -20,7 +20,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>; type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>; type SerializeStruct = StructIndexer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> { fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -302,14 +302,14 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
} }
} }
pub struct StructSerializer<'a> { pub struct StructIndexer<'a> {
attribute: SchemaAttr, attribute: SchemaAttr,
document_id: DocumentId, document_id: DocumentId,
indexer: &'a mut RawIndexer, indexer: &'a mut RawIndexer,
texts: Vec<String>, texts: Vec<String>,
} }
impl<'a> ser::SerializeStruct for StructSerializer<'a> { impl<'a> ser::SerializeStruct for StructIndexer<'a> {
type Ok = Option<usize>; type Ok = Option<usize>;
type Error = SerializerError; type Error = SerializerError;

View File

@ -20,22 +20,20 @@ pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer; pub use self::indexer::Indexer;
pub use self::serializer::Serializer; pub use self::serializer::{serialize_value, Serializer};
use std::collections::BTreeMap;
use std::{error::Error, fmt}; use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr;
use serde::ser; use serde::ser;
use serde_json::Error as SerdeJsonError; use serde_json::Error as SerdeJsonError;
use crate::{DocumentId, ParseNumberError}; use crate::ParseNumberError;
#[derive(Debug)] #[derive(Debug)]
pub enum SerializerError { pub enum SerializerError {
DocumentIdNotFound, DocumentIdNotFound,
InvalidDocumentIdType, InvalidDocumentIdType,
Zlmdb(zlmdb::Error), Zlmdb(heed::Error),
SerdeJson(SerdeJsonError), SerdeJson(SerdeJsonError),
ParseNumber(ParseNumberError), ParseNumber(ParseNumberError),
UnserializableType { type_name: &'static str }, UnserializableType { type_name: &'static str },
@ -59,7 +57,7 @@ impl fmt::Display for SerializerError {
SerializerError::InvalidDocumentIdType => { SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number") f.write_str("document identifier can only be of type string or number")
} }
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e), SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e), SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => { SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e) write!(f, "error while trying to parse a number: {}", e)
@ -92,8 +90,8 @@ impl From<SerdeJsonError> for SerializerError {
} }
} }
impl From<zlmdb::Error> for SerializerError { impl From<heed::Error> for SerializerError {
fn from(error: zlmdb::Error) -> SerializerError { fn from(error: heed::Error) -> SerializerError {
SerializerError::Zlmdb(error) SerializerError::Zlmdb(error)
} }
} }
@ -103,25 +101,3 @@ impl From<ParseNumberError> for SerializerError {
SerializerError::ParseNumber(error) SerializerError::ParseNumber(error)
} }
} }
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
impl Default for RamDocumentStore {
fn default() -> Self {
Self::new()
}
}

View File

@ -1,31 +1,31 @@
use meilidb_schema::{Schema, SchemaAttr}; use meilidb_schema::{Schema, SchemaAttr, SchemaProps};
use serde::ser; use serde::ser;
use std::collections::HashMap;
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore; use crate::store::{DocumentsFields, DocumentsFieldsCounts};
use crate::{DocumentId, RankedMap}; use crate::{DocumentId, RankedMap};
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError}; use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> { pub struct Serializer<'a, 'b> {
pub txn: &'a mut heed::RwTxn<'b>,
pub schema: &'a Schema, pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore, pub document_store: DocumentsFields,
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, pub document_fields_counts: DocumentsFieldsCounts,
pub indexer: &'a mut RawIndexer, pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap, pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId, pub document_id: DocumentId,
} }
impl<'a> ser::Serializer for Serializer<'a> { impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
type Ok = (); type Ok = ();
type Error = SerializerError; type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>; type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>; type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>; type SerializeMap = MapSerializer<'a, 'b>;
type SerializeStruct = StructSerializer<'a>; type SerializeStruct = StructSerializer<'a, 'b>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! { forward_to_unserializable_type! {
@ -150,6 +150,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer { Ok(MapSerializer {
txn: self.txn,
schema: self.schema, schema: self.schema,
document_id: self.document_id, document_id: self.document_id,
document_store: self.document_store, document_store: self.document_store,
@ -166,6 +167,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
_len: usize, _len: usize,
) -> Result<Self::SerializeStruct, Self::Error> { ) -> Result<Self::SerializeStruct, Self::Error> {
Ok(StructSerializer { Ok(StructSerializer {
txn: self.txn,
schema: self.schema, schema: self.schema,
document_id: self.document_id, document_id: self.document_id,
document_store: self.document_store, document_store: self.document_store,
@ -188,17 +190,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
} }
} }
pub struct MapSerializer<'a> { pub struct MapSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b>,
schema: &'a Schema, schema: &'a Schema,
document_id: DocumentId, document_id: DocumentId,
document_store: &'a mut RamDocumentStore, document_store: DocumentsFields,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer, indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap, ranked_map: &'a mut RankedMap,
current_key_name: Option<String>, current_key_name: Option<String>,
} }
impl<'a> ser::SerializeMap for MapSerializer<'a> { impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
type Ok = (); type Ok = ();
type Error = SerializerError; type Error = SerializerError;
@ -229,17 +232,20 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
V: ser::Serialize, V: ser::Serialize,
{ {
let key = key.serialize(ConvertToString)?; let key = key.serialize(ConvertToString)?;
match self.schema.attribute(&key) {
serialize_value( Some(attribute) => serialize_value(
self.schema, self.txn,
self.document_id, attribute,
self.document_store, self.schema.props(attribute),
self.document_fields_counts, self.document_id,
self.indexer, self.document_store,
self.ranked_map, self.document_fields_counts,
&key, self.indexer,
value, self.ranked_map,
) value,
),
None => Ok(()),
}
} }
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
@ -247,16 +253,17 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
} }
} }
pub struct StructSerializer<'a> { pub struct StructSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b>,
schema: &'a Schema, schema: &'a Schema,
document_id: DocumentId, document_id: DocumentId,
document_store: &'a mut RamDocumentStore, document_store: DocumentsFields,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer, indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap, ranked_map: &'a mut RankedMap,
} }
impl<'a> ser::SerializeStruct for StructSerializer<'a> { impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
type Ok = (); type Ok = ();
type Error = SerializerError; type Error = SerializerError;
@ -268,16 +275,20 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
where where
T: ser::Serialize, T: ser::Serialize,
{ {
serialize_value( match self.schema.attribute(key) {
self.schema, Some(attribute) => serialize_value(
self.document_id, self.txn,
self.document_store, attribute,
self.document_fields_counts, self.schema.props(attribute),
self.indexer, self.document_id,
self.ranked_map, self.document_store,
key, self.document_fields_counts,
value, self.indexer,
) self.ranked_map,
value,
),
None => Ok(()),
}
} }
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
@ -285,40 +296,42 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
} }
} }
fn serialize_value<T: ?Sized>( pub fn serialize_value<T: ?Sized>(
schema: &Schema, txn: &mut heed::RwTxn,
attribute: SchemaAttr,
props: SchemaProps,
document_id: DocumentId, document_id: DocumentId,
document_store: &mut RamDocumentStore, document_store: DocumentsFields,
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>, documents_fields_counts: DocumentsFieldsCounts,
indexer: &mut RawIndexer, indexer: &mut RawIndexer,
ranked_map: &mut RankedMap, ranked_map: &mut RankedMap,
key: &str,
value: &T, value: &T,
) -> Result<(), SerializerError> ) -> Result<(), SerializerError>
where where
T: ser::Serialize, T: ser::Serialize,
{ {
if let Some(attribute) = schema.attribute(key) { let serialized = serde_json::to_vec(value)?;
let props = schema.props(attribute); document_store.put_document_field(txn, document_id, attribute, &serialized)?;
let serialized = serde_json::to_vec(value)?; if props.is_indexed() {
document_store.set_document_field(document_id, attribute, serialized); let indexer = Indexer {
attribute,
if props.is_indexed() { indexer,
let indexer = Indexer { document_id,
attribute, };
indexer, if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.put_document_field_count(
txn,
document_id, document_id,
}; attribute,
if let Some(number_of_words) = value.serialize(indexer)? { number_of_words as u64,
documents_fields_counts.insert((document_id, attribute), number_of_words as u64); )?;
}
} }
}
if props.is_ranked() { if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?; let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number); ranked_map.insert(document_id, attribute, number);
}
} }
Ok(()) Ok(())
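Two behavioural points fall out of this rewrite of serialize_value: writes now go straight into the heed stores, so callers must hold a &mut heed::RwTxn for the whole addition, and a field whose key is not declared in the schema is silently skipped (the None => Ok(()) arms above) rather than indexed or rejected. The per-field guard used by both MapSerializer and StructSerializer, in isolation:

    // Sketch of the guard: only attributes known to the schema are serialized.
    match schema.attribute(&key) {
        Some(attribute) => serialize_value(
            txn,
            attribute,
            schema.props(attribute),
            document_id,
            document_store,
            document_fields_counts,
            indexer,
            ranked_map,
            value,
        )?,
        None => (), // field not present in the schema: dropped without error
    }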

View File

@ -1,18 +1,18 @@
use super::BEU64; use super::BEU64;
use crate::DocumentId; use crate::DocumentId;
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use std::sync::Arc; use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocsWords { pub struct DocsWords {
pub(crate) docs_words: zlmdb::Database<OwnedType<BEU64>, ByteSlice>, pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
} }
impl DocsWords { impl DocsWords {
pub fn put_doc_words( pub fn put_doc_words(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
document_id: DocumentId, document_id: DocumentId,
words: &fst::Set, words: &fst::Set,
) -> ZResult<()> { ) -> ZResult<()> {
@ -21,18 +21,18 @@ impl DocsWords {
self.docs_words.put(writer, &document_id, bytes) self.docs_words.put(writer, &document_id, bytes)
} }
pub fn del_doc_words( pub fn del_doc_words(self, writer: &mut heed::RwTxn, document_id: DocumentId) -> ZResult<bool> {
self,
writer: &mut zlmdb::RwTxn,
document_id: DocumentId,
) -> ZResult<bool> {
let document_id = BEU64::new(document_id.0); let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id) self.docs_words.delete(writer, &document_id)
} }
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.docs_words.clear(writer)
}
pub fn doc_words( pub fn doc_words(
self, self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<Option<fst::Set>> { ) -> ZResult<Option<fst::Set>> {
let document_id = BEU64::new(document_id.0); let document_id = BEU64::new(document_id.0);

View File

@ -1,19 +1,19 @@
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr; use meilidb_schema::SchemaAttr;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
use super::DocumentAttrKey; use super::DocumentAttrKey;
use crate::DocumentId; use crate::DocumentId;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocumentsFields { pub struct DocumentsFields {
pub(crate) documents_fields: zlmdb::Database<OwnedType<DocumentAttrKey>, ByteSlice>, pub(crate) documents_fields: heed::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
} }
impl DocumentsFields { impl DocumentsFields {
pub fn put_document_field( pub fn put_document_field(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
value: &[u8], value: &[u8],
@ -24,17 +24,21 @@ impl DocumentsFields {
pub fn del_all_document_fields( pub fn del_all_document_fields(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<usize> { ) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields.delete_range(writer, start..=end) self.documents_fields.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields.clear(writer)
} }
pub fn document_attribute<'txn>( pub fn document_attribute<'txn>(
self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>> { ) -> ZResult<Option<&'txn [u8]>> {
@ -44,18 +48,18 @@ impl DocumentsFields {
pub fn document_fields<'txn>( pub fn document_fields<'txn>(
self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> { ) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields.range(reader, start..=end)?; let iter = self.documents_fields.range(reader, &(start..=end))?;
Ok(DocumentFieldsIter { iter }) Ok(DocumentFieldsIter { iter })
} }
} }
pub struct DocumentFieldsIter<'txn> { pub struct DocumentFieldsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>, iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
} }
impl<'txn> Iterator for DocumentFieldsIter<'txn> { impl<'txn> Iterator for DocumentFieldsIter<'txn> {
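Beyond the zlmdb → heed rename, heed 0.5 takes range bounds by reference, which is why every range and delete_range call above now wraps its bounds in &(start..=end). In isolation, using the names from the code above:

    // heed 0.5: ranges are passed by reference (sketch)
    let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
    let end = DocumentAttrKey::new(document_id, SchemaAttr::max());

    let iter = documents_fields.range(reader, &(start..=end))?;           // read side
    let deleted = documents_fields.delete_range(writer, &(start..=end))?; // write side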

View File

@ -1,18 +1,18 @@
use super::DocumentAttrKey; use super::DocumentAttrKey;
use crate::DocumentId; use crate::DocumentId;
use heed::types::OwnedType;
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr; use meilidb_schema::SchemaAttr;
use zlmdb::types::OwnedType;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts { pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: zlmdb::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>, pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
} }
impl DocumentsFieldsCounts { impl DocumentsFieldsCounts {
pub fn put_document_field_count( pub fn put_document_field_count(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
value: u64, value: u64,
@ -23,18 +23,22 @@ impl DocumentsFieldsCounts {
pub fn del_all_document_fields_counts( pub fn del_all_document_fields_counts(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<usize> { ) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields_counts self.documents_fields_counts
.delete_range(writer, start..=end) .delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.documents_fields_counts.clear(writer)
} }
pub fn document_field_count( pub fn document_field_count(
self, self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> ZResult<Option<u64>> { ) -> ZResult<Option<u64>> {
@ -47,19 +51,16 @@ impl DocumentsFieldsCounts {
pub fn document_fields_counts<'txn>( pub fn document_fields_counts<'txn>(
self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> { ) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields_counts.range(reader, start..=end)?; let iter = self.documents_fields_counts.range(reader, &(start..=end))?;
Ok(DocumentFieldsCountsIter { iter }) Ok(DocumentFieldsCountsIter { iter })
} }
pub fn documents_ids<'txn>( pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<DocumentsIdsIter<'txn>> {
self,
reader: &'txn zlmdb::RoTxn,
) -> ZResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?; let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter { Ok(DocumentsIdsIter {
last_seen_id: None, last_seen_id: None,
@ -69,7 +70,7 @@ impl DocumentsFieldsCounts {
pub fn all_documents_fields_counts<'txn>( pub fn all_documents_fields_counts<'txn>(
self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn heed::RoTxn,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> { ) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?; let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter }) Ok(AllDocumentsFieldsCountsIter { iter })
@ -77,7 +78,7 @@ impl DocumentsFieldsCounts {
} }
pub struct DocumentFieldsCountsIter<'txn> { pub struct DocumentFieldsCountsIter<'txn> {
iter: zlmdb::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
} }
impl Iterator for DocumentFieldsCountsIter<'_> { impl Iterator for DocumentFieldsCountsIter<'_> {
@ -97,7 +98,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> { pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>, last_seen_id: Option<DocumentId>,
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
} }
impl Iterator for DocumentsIdsIter<'_> { impl Iterator for DocumentsIdsIter<'_> {
@ -121,10 +122,10 @@ impl Iterator for DocumentsIdsIter<'_> {
} }
pub struct AllDocumentsFieldsCountsIter<'txn> { pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: zlmdb::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>, iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
} }
impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> { impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>; type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {

View File

@ -1,28 +1,33 @@
use crate::RankedMap; use crate::RankedMap;
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
use heed::Result as ZResult;
use meilidb_schema::Schema; use meilidb_schema::Schema;
use std::sync::Arc; use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
use zlmdb::Result as ZResult;
const CUSTOMS_KEY: &str = "customs-key"; const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map"; const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema"; const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms"; const SYNONYMS_KEY: &str = "synonyms";
const STOP_WORDS_KEY: &str = "stop-words";
const WORDS_KEY: &str = "words"; const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct Main { pub struct Main {
pub(crate) main: zlmdb::DynDatabase, pub(crate) main: heed::PolyDatabase,
} }
impl Main { impl Main {
pub fn put_words_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.main.clear(writer)
}
pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes(); let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes) self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
} }
pub fn words_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> { pub fn words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? { match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();
@ -34,31 +39,32 @@ impl Main {
} }
} }
pub fn put_schema(self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> { pub fn put_schema(self, writer: &mut heed::RwTxn, schema: &Schema) -> ZResult<()> {
self.main self.main
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema) .put::<Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
} }
pub fn schema(self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> { pub fn schema(self, reader: &heed::RoTxn) -> ZResult<Option<Schema>> {
self.main.get::<Str, Serde<Schema>>(reader, SCHEMA_KEY)
}
pub fn put_ranked_map(self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main self.main
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map) .get::<Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
} }
pub fn ranked_map(self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> { pub fn put_ranked_map(self, writer: &mut heed::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main self.main
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY) .put::<Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
} }
pub fn put_synonyms_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { pub fn ranked_map(self, reader: &heed::RoTxn) -> ZResult<Option<RankedMap>> {
self.main
.get::<Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes(); let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes) self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
} }
pub fn synonyms_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> { pub fn synonyms_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? { match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();
@ -70,7 +76,25 @@ impl Main {
} }
} }
pub fn put_number_of_documents<F>(self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64> pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main
.put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
}
pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
where where
F: Fn(u64) -> u64, F: Fn(u64) -> u64,
{ {
@ -80,7 +104,7 @@ impl Main {
Ok(new) Ok(new)
} }
pub fn number_of_documents(self, reader: &zlmdb::RoTxn) -> ZResult<u64> { pub fn number_of_documents(self, reader: &heed::RoTxn) -> ZResult<u64> {
match self match self
.main .main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? .get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
@ -90,12 +114,12 @@ impl Main {
} }
} }
pub fn put_customs(self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> { pub fn put_customs(self, writer: &mut heed::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs) .put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
} }
pub fn customs<'txn>(self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> { pub fn customs<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY) self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
} }
} }
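With the new STOP_WORDS_KEY, the stop-word set is persisted next to the words and synonyms fsts. A sketch of writing and reading it back, assuming access to the index's Main store handle and a heed::Env (the word list is illustrative):

    // The fst builder needs sorted, deduplicated input, hence SetBuf::from_dirty.
    let stop_words = sdset::SetBuf::from_dirty(vec!["a", "of", "the"]);
    let stop_words = fst::Set::from_iter(stop_words).unwrap();

    let mut writer = env.write_txn()?;
    index.main.put_stop_words_fst(&mut writer, &stop_words)?;
    writer.commit()?;

    let reader = env.read_txn()?;
    let loaded = index.main.stop_words_fst(&reader)?.unwrap_or_default();
    assert!(loaded.contains("the"));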

View File

@ -20,12 +20,13 @@ pub use self::updates_results::UpdatesResults;
use std::collections::HashSet; use std::collections::HashSet;
use heed::Result as ZResult;
use meilidb_schema::{Schema, SchemaAttr}; use meilidb_schema::{Schema, SchemaAttr};
use serde::de; use serde::de::{self, Deserialize};
use zerocopy::{AsBytes, FromBytes}; use zerocopy::{AsBytes, FromBytes};
use zlmdb::Result as ZResult;
use crate::criterion::Criteria; use crate::criterion::Criteria;
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::serde::Deserializer; use crate::serde::Deserializer;
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult}; use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
@ -91,13 +92,13 @@ pub struct Index {
pub updates: Updates, pub updates: Updates,
pub updates_results: UpdatesResults, pub updates_results: UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
} }
impl Index { impl Index {
pub fn document<T: de::DeserializeOwned>( pub fn document<T: de::DeserializeOwned>(
&self, &self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
attributes: Option<&HashSet<&str>>, attributes: Option<&HashSet<&str>>,
document_id: DocumentId, document_id: DocumentId,
) -> MResult<Option<T>> { ) -> MResult<Option<T>> {
@ -120,14 +121,12 @@ impl Index {
attributes: attributes.as_ref(), attributes: attributes.as_ref(),
}; };
// TODO: currently we return an error if all document fields are missing, Ok(Option::<T>::deserialize(&mut deserializer)?)
// returning None would have been better
Ok(T::deserialize(&mut deserializer).map(Some)?)
} }
pub fn document_attribute<T: de::DeserializeOwned>( pub fn document_attribute<T: de::DeserializeOwned>(
&self, &self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> MResult<Option<T>> { ) -> MResult<Option<T>> {
@ -140,13 +139,13 @@ impl Index {
} }
} }
pub fn schema_update(&self, writer: &mut zlmdb::RwTxn, schema: Schema) -> MResult<u64> { pub fn schema_update(&self, writer: &mut heed::RwTxn, schema: Schema) -> MResult<u64> {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_schema_update(writer, self.updates, self.updates_results, schema) update::push_schema_update(writer, self.updates, self.updates_results, schema)
} }
pub fn customs_update(&self, writer: &mut zlmdb::RwTxn, customs: Vec<u8>) -> ZResult<u64> { pub fn customs_update(&self, writer: &mut heed::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_customs_update(writer, self.updates, self.updates_results, customs) update::push_customs_update(writer, self.updates, self.updates_results, customs)
} }
@ -158,6 +157,14 @@ impl Index {
) )
} }
pub fn documents_partial_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new_partial(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_deletion(&self) -> update::DocumentsDeletion { pub fn documents_deletion(&self) -> update::DocumentsDeletion {
update::DocumentsDeletion::new( update::DocumentsDeletion::new(
self.updates, self.updates,
@ -166,6 +173,11 @@ impl Index {
) )
} }
pub fn clear_all(&self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_clear_all(writer, self.updates, self.updates_results)
}
pub fn synonyms_addition(&self) -> update::SynonymsAddition { pub fn synonyms_addition(&self) -> update::SynonymsAddition {
update::SynonymsAddition::new( update::SynonymsAddition::new(
self.updates, self.updates,
@ -182,7 +194,23 @@ impl Index {
) )
} }
pub fn current_update_id(&self, reader: &zlmdb::RoTxn) -> MResult<Option<u64>> { pub fn stop_words_addition(&self) -> update::StopWordsAddition {
update::StopWordsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
update::StopWordsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
match self.updates.last_update_id(reader)? { match self.updates.last_update_id(reader)? {
Some((id, _)) => Ok(Some(id)), Some((id, _)) => Ok(Some(id)),
None => Ok(None), None => Ok(None),
@ -191,12 +219,38 @@ impl Index {
pub fn update_status( pub fn update_status(
&self, &self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
update_id: u64, update_id: u64,
) -> MResult<update::UpdateStatus> { ) -> MResult<update::UpdateStatus> {
update::update_status(reader, self.updates, self.updates_results, update_id) update::update_status(reader, self.updates, self.updates_results, update_id)
} }
pub fn all_updates_status(&self, reader: &heed::RoTxn) -> MResult<Vec<update::UpdateStatus>> {
let mut updates = Vec::new();
let mut last_update_result_id = 0;
// retrieve all update results
if let Some((last_id, _)) = self.updates_results.last_update_id(reader)? {
updates.reserve(last_id as usize);
for id in 0..=last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
last_update_result_id = id;
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update_id(reader)? {
for id in last_update_result_id + 1..last_id {
let update = self.update_status(reader, id)?;
updates.push(update);
}
}
Ok(updates)
}
pub fn query_builder(&self) -> QueryBuilder { pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new( QueryBuilder::new(
self.main, self.main,
@ -221,9 +275,9 @@ impl Index {
} }
pub fn create( pub fn create(
env: &zlmdb::Env, env: &heed::Env,
name: &str, name: &str,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> MResult<Index> { ) -> MResult<Index> {
// create all the store names // create all the store names
let main_name = main_name(name); let main_name = main_name(name);
@ -236,7 +290,7 @@ pub fn create(
let updates_results_name = updates_results_name(name); let updates_results_name = updates_results_name(name);
// open all the stores // open all the stores
let main = env.create_dyn_database(Some(&main_name))?; let main = env.create_poly_database(Some(&main_name))?;
let postings_lists = env.create_database(Some(&postings_lists_name))?; let postings_lists = env.create_database(Some(&postings_lists_name))?;
let documents_fields = env.create_database(Some(&documents_fields_name))?; let documents_fields = env.create_database(Some(&documents_fields_name))?;
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?; let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
@ -261,9 +315,9 @@ pub fn create(
} }
pub fn open( pub fn open(
env: &zlmdb::Env, env: &heed::Env,
name: &str, name: &str,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> MResult<Option<Index>> { ) -> MResult<Option<Index>> {
// create all the store names // create all the store names
let main_name = main_name(name); let main_name = main_name(name);
@ -276,7 +330,7 @@ pub fn open(
let updates_results_name = updates_results_name(name); let updates_results_name = updates_results_name(name);
// open all the stores // open all the stores
let main = match env.open_dyn_database(Some(&main_name))? { let main = match env.open_poly_database(Some(&main_name))? {
Some(main) => main, Some(main) => main,
None => return Ok(None), None => return Ok(None),
}; };
@ -323,3 +377,19 @@ pub fn open(
updates_notifier, updates_notifier,
})) }))
} }
pub fn clear(writer: &mut heed::RwTxn, index: &Index) -> MResult<()> {
// send a stop event to the update loop of the index
index.updates_notifier.send(UpdateEvent::MustStop).unwrap();
// clear all the stores
index.main.clear(writer)?;
index.postings_lists.clear(writer)?;
index.documents_fields.clear(writer)?;
index.documents_fields_counts.clear(writer)?;
index.synonyms.clear(writer)?;
index.docs_words.clear(writer)?;
index.updates.clear(writer)?;
index.updates_results.clear(writer)?;
Ok(())
}
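Taken together, the Index now exposes partial document additions, stop-word updates, clear_all and all_updates_status, and document() returns Ok(None) for a missing document instead of an error. A hedged usage sketch of that surface; MyDoc, env and index are placeholders for the caller's own document type and handles, and the sketch lives inside a fallible function:

    let reader = env.read_txn()?;

    // a missing or fully deleted document now comes back as None
    let doc: Option<MyDoc> = index.document(&reader, None, DocumentId(7))?;

    // builders for the new update kinds (documents and stop words); their
    // finalize calls are omitted in this sketch
    let _partial = index.documents_partial_addition::<serde_json::Value>();
    let _stop_add = index.stop_words_addition();
    let _stop_del = index.stop_words_deletion();

    // every processed and enqueued update, in order
    let statuses = index.all_updates_status(&reader)?;

    // wipe the whole index content through the update system
    let mut writer = env.write_txn()?;
    let clear_update_id = index.clear_all(&mut writer)?;
    writer.commit()?;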

View File

@ -1,31 +1,35 @@
use crate::DocIndex; use crate::DocIndex;
use heed::types::{ByteSlice, CowSlice};
use heed::Result as ZResult;
use sdset::{Set, SetBuf}; use sdset::{Set, SetBuf};
use std::borrow::Cow; use std::borrow::Cow;
use zlmdb::types::{ByteSlice, CowSlice};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct PostingsLists { pub struct PostingsLists {
pub(crate) postings_lists: zlmdb::Database<ByteSlice, CowSlice<DocIndex>>, pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
} }
impl PostingsLists { impl PostingsLists {
pub fn put_postings_list( pub fn put_postings_list(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
word: &[u8], word: &[u8],
words_indexes: &Set<DocIndex>, words_indexes: &Set<DocIndex>,
) -> ZResult<()> { ) -> ZResult<()> {
self.postings_lists.put(writer, word, words_indexes) self.postings_lists.put(writer, word, words_indexes)
} }
pub fn del_postings_list(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> { pub fn del_postings_list(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word) self.postings_lists.delete(writer, word)
} }
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.postings_lists.clear(writer)
}
pub fn postings_list<'txn>( pub fn postings_list<'txn>(
self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn heed::RoTxn,
word: &[u8], word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> { ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
match self.postings_lists.get(reader, word)? { match self.postings_lists.get(reader, word)? {

View File

@ -1,16 +1,16 @@
use heed::types::ByteSlice;
use heed::Result as ZResult;
use std::sync::Arc; use std::sync::Arc;
use zlmdb::types::ByteSlice;
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct Synonyms { pub struct Synonyms {
pub(crate) synonyms: zlmdb::Database<ByteSlice, ByteSlice>, pub(crate) synonyms: heed::Database<ByteSlice, ByteSlice>,
} }
impl Synonyms { impl Synonyms {
pub fn put_synonyms( pub fn put_synonyms(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
word: &[u8], word: &[u8],
synonyms: &fst::Set, synonyms: &fst::Set,
) -> ZResult<()> { ) -> ZResult<()> {
@ -18,11 +18,15 @@ impl Synonyms {
self.synonyms.put(writer, word, bytes) self.synonyms.put(writer, word, bytes)
} }
pub fn del_synonyms(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> { pub fn del_synonyms(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word) self.synonyms.delete(writer, word)
} }
pub fn synonyms(self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> { pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.synonyms.clear(writer)
}
pub fn synonyms(self, reader: &heed::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
match self.synonyms.get(reader, word)? { match self.synonyms.get(reader, word)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();

View File

@ -1,42 +1,16 @@
use super::BEU64; use super::BEU64;
use crate::update::Update; use crate::update::Update;
use serde::{Deserialize, Serialize}; use heed::types::{OwnedType, SerdeJson};
use std::borrow::Cow; use heed::Result as ZResult;
use zlmdb::types::OwnedType;
use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
pub struct SerdeJson<T>(std::marker::PhantomData<T>);
impl<T> BytesEncode for SerdeJson<T>
where
T: Serialize,
{
type EItem = T;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
serde_json::to_vec(item).map(Cow::Owned).ok()
}
}
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
where
T: Deserialize<'a> + Clone,
{
type DItem = T;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
serde_json::from_slice(bytes).ok()
}
}
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct Updates { pub struct Updates {
pub(crate) updates: zlmdb::Database<OwnedType<BEU64>, SerdeJson<Update>>, pub(crate) updates: heed::Database<OwnedType<BEU64>, SerdeJson<Update>>,
} }
impl Updates { impl Updates {
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> { pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? { match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -44,7 +18,7 @@ impl Updates {
} }
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
fn first_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> { fn first_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? { match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -52,14 +26,14 @@ impl Updates {
} }
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
pub fn contains(self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> { pub fn get(self, reader: &heed::RoTxn, update_id: u64) -> ZResult<Option<Update>> {
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id).map(|v| v.is_some()) self.updates.get(reader, &update_id)
} }
pub fn put_update( pub fn put_update(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
update_id: u64, update_id: u64,
update: &Update, update: &Update,
) -> ZResult<()> { ) -> ZResult<()> {
@ -68,7 +42,7 @@ impl Updates {
self.updates.put(writer, &update_id, update) self.updates.put(writer, &update_id, update)
} }
pub fn pop_front(self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> { pub fn pop_front(self, writer: &mut heed::RwTxn) -> ZResult<Option<(u64, Update)>> {
match self.first_update_id(writer)? { match self.first_update_id(writer)? {
Some((update_id, update)) => { Some((update_id, update)) => {
let key = BEU64::new(update_id); let key = BEU64::new(update_id);
@ -78,4 +52,8 @@ impl Updates {
None => Ok(None), None => Ok(None),
} }
} }
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.updates.clear(writer)
}
} }
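A small sketch of how this store behaves as a FIFO queue, assuming an open updates store, a write transaction and variants of the crate's Update enum (ClearAll is only added later in this changeset); names are illustrative. Keys are big-endian u64 ids, so pop_front always returns the oldest enqueued update:
let mut writer = env.write_txn()?;
updates_store.put_update(&mut writer, 1, &Update::ClearAll)?;
updates_store.put_update(&mut writer, 2, &Update::Customs(Vec::new()))?;
// pop_front removes and returns the entry with the smallest id
if let Some((update_id, Update::ClearAll)) = updates_store.pop_front(&mut writer)? {
    assert_eq!(update_id, 1);
}
writer.commit()?;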

View File

@ -1,15 +1,19 @@
use super::BEU64; use super::BEU64;
use crate::update::UpdateResult; use crate::update::ProcessedUpdateResult;
use zlmdb::types::{OwnedType, Serde}; use heed::types::{OwnedType, SerdeBincode};
use zlmdb::Result as ZResult; use heed::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct UpdatesResults { pub struct UpdatesResults {
pub(crate) updates_results: zlmdb::Database<OwnedType<BEU64>, Serde<UpdateResult>>, pub(crate) updates_results:
heed::Database<OwnedType<BEU64>, SerdeBincode<ProcessedUpdateResult>>,
} }
impl UpdatesResults { impl UpdatesResults {
pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> { pub fn last_update_id(
self,
reader: &heed::RoTxn,
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
match self.updates_results.last(reader)? { match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -18,9 +22,9 @@ impl UpdatesResults {
pub fn put_update_result( pub fn put_update_result(
self, self,
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
update_id: u64, update_id: u64,
update_result: &UpdateResult, update_result: &ProcessedUpdateResult,
) -> ZResult<()> { ) -> ZResult<()> {
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result) self.updates_results.put(writer, &update_id, update_result)
@ -28,10 +32,14 @@ impl UpdatesResults {
pub fn update_result( pub fn update_result(
self, self,
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
update_id: u64, update_id: u64,
) -> ZResult<Option<UpdateResult>> { ) -> ZResult<Option<ProcessedUpdateResult>> {
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id) self.updates_results.get(reader, &update_id)
} }
pub fn clear(self, writer: &mut heed::RwTxn) -> ZResult<()> {
self.updates_results.clear(writer)
}
} }

View File

@ -0,0 +1,33 @@
use crate::update::{next_update_id, Update};
use crate::{store, MResult, RankedMap};
pub fn apply_clear_all(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &RankedMap::default())?;
main_store.put_number_of_documents(writer, |_| 0)?;
documents_fields_store.clear(writer)?;
documents_fields_counts_store.clear(writer)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
Ok(())
}
pub fn push_clear_all(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::ClearAll;
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
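A minimal sketch of the intended flow, with illustrative env and index names: push_clear_all only enqueues the update, and the update loop later calls apply_clear_all with the index stores.
let mut writer = env.write_txn()?;
// enqueue the ClearAll update; it is applied asynchronously by the update loop
let update_id = push_clear_all(&mut writer, index.updates, index.updates_results)?;
writer.commit()?;
println!("clear-all enqueued as update {}", update_id);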

View File

@ -1,9 +1,9 @@
use crate::store; use crate::store;
use crate::update::{next_update_id, Update}; use crate::update::{next_update_id, Update};
use zlmdb::Result as ZResult; use heed::Result as ZResult;
pub fn apply_customs_update( pub fn apply_customs_update(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
main_store: store::Main, main_store: store::Main,
customs: &[u8], customs: &[u8],
) -> ZResult<()> { ) -> ZResult<()> {
@ -11,7 +11,7 @@ pub fn apply_customs_update(
} }
pub fn push_customs_update( pub fn push_customs_update(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
customs: Vec<u8>, customs: Vec<u8>,

View File

@ -1,11 +1,12 @@
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use fst::{set::OpBuilder, SetBuilder}; use fst::{set::OpBuilder, SetBuilder};
use sdset::{duo::Union, SetOperation}; use sdset::{duo::Union, SetOperation};
use serde::Serialize; use serde::{Deserialize, Serialize};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, RamDocumentStore, Serializer}; use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
use crate::store; use crate::store;
use crate::update::{apply_documents_deletion, next_update_id, Update}; use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{Error, MResult, RankedMap}; use crate::{Error, MResult, RankedMap};
@ -13,21 +14,37 @@ use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> { pub struct DocumentsAddition<D> {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
documents: Vec<D>, documents: Vec<D>,
is_partial: bool,
} }
impl<D> DocumentsAddition<D> { impl<D> DocumentsAddition<D> {
pub fn new( pub fn new(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> { ) -> DocumentsAddition<D> {
DocumentsAddition { DocumentsAddition {
updates_store, updates_store,
updates_results_store, updates_results_store,
updates_notifier, updates_notifier,
documents: Vec::new(), documents: Vec::new(),
is_partial: false,
}
}
pub fn new_partial(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: true,
} }
} }
@ -35,16 +52,17 @@ impl<D> DocumentsAddition<D> {
self.documents.push(document); self.documents.push(document);
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64>
where where
D: serde::Serialize, D: serde::Serialize,
{ {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_documents_addition( let update_id = push_documents_addition(
writer, writer,
self.updates_store, self.updates_store,
self.updates_results_store, self.updates_results_store,
self.documents, self.documents,
self.is_partial,
)?; )?;
Ok(update_id) Ok(update_id)
} }
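A hedged usage sketch of the new partial mode, assuming an index handle and environment with illustrative names; documents are fed in through the Extend impl that follows just after this sketch:
let mut addition = DocumentsAddition::new_partial(
    index.updates,
    index.updates_results,
    index.updates_notifier.clone(),
);
// only the fields present here will overwrite the stored document
addition.extend(vec![serde_json::json!({ "id": 7, "title": "new title only" })]);
let mut writer = env.write_txn()?;
let update_id = addition.finalize(&mut writer)?;
writer.commit()?;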
@ -57,10 +75,11 @@ impl<D> Extend<D> for DocumentsAddition<D> {
} }
pub fn push_documents_addition<D: serde::Serialize>( pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
addition: Vec<D>, addition: Vec<D>,
is_partial: bool,
) -> MResult<u64> { ) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len()); let mut values = Vec::with_capacity(addition.len());
for add in addition { for add in addition {
@ -71,26 +90,27 @@ pub fn push_documents_addition<D: serde::Serialize>(
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsAddition(values); let update = if is_partial {
Update::DocumentsPartial(values)
} else {
Update::DocumentsAddition(values)
};
updates_store.put_update(writer, last_update_id, &update)?; updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id) Ok(last_update_id)
} }
pub fn apply_documents_addition( pub fn apply_documents_addition<'a, 'b>(
writer: &mut zlmdb::RwTxn, writer: &'a mut heed::RwTxn<'b>,
main_store: store::Main, main_store: store::Main,
documents_fields_store: store::DocumentsFields, documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts, documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists, postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords, docs_words_store: store::DocsWords,
mut ranked_map: RankedMap, addition: Vec<HashMap<String, serde_json::Value>>,
addition: Vec<serde_json::Value>,
) -> MResult<()> { ) -> MResult<()> {
let mut document_ids = HashSet::new(); let mut documents_additions = HashMap::new();
let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new();
let mut indexer = RawIndexer::new();
let schema = match main_store.schema(writer)? { let schema = match main_store.schema(writer)? {
Some(schema) => schema, Some(schema) => schema,
@ -99,20 +119,48 @@ pub fn apply_documents_addition(
let identifier = schema.identifier_name(); let identifier = schema.identifier_name();
// 1. store documents ids for future deletion
for document in addition { for document in addition {
let document_id = match extract_document_id(identifier, &document)? { let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id, Some(id) => id,
None => return Err(Error::MissingDocumentId), None => return Err(Error::MissingDocumentId),
}; };
// 1. store the document id for future deletion documents_additions.insert(document_id, document);
document_ids.insert(document_id); }
// 2. index the document fields in ram stores // 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
documents_ids,
)?;
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer { let serializer = Serializer {
txn: writer,
schema: &schema, schema: &schema,
document_store: &mut document_store, document_store: documents_fields_store,
document_fields_counts: &mut document_fields_counts, document_fields_counts: documents_fields_counts_store,
indexer: &mut indexer, indexer: &mut indexer,
ranked_map: &mut ranked_map, ranked_map: &mut ranked_map,
document_id, document_id,
@ -121,8 +169,65 @@ pub fn apply_documents_addition(
document.serialize(serializer)?; document.serialize(serializer)?;
} }
// 1. remove the previous documents match indexes write_documents_addition_index(
let documents_to_insert = document_ids.iter().cloned().collect(); writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)
}
pub fn apply_documents_partial_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b>,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
addition: Vec<HashMap<String, serde_json::Value>>,
) -> MResult<()> {
let mut documents_additions = HashMap::new();
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let identifier = schema.identifier_name();
// 1. store documents ids for future deletion
for mut document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
let mut deserializer = Deserializer {
document_id,
reader: writer,
documents_fields: documents_fields_store,
schema: &schema,
attributes: None,
};
// retrieve the old document and
// update the new one with missing keys found in the old one
let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
if let Some(old_document) = result {
for (key, value) in old_document {
document.entry(key).or_insert(value);
}
}
documents_additions.insert(document_id, document);
}
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion( apply_documents_deletion(
writer, writer,
main_store, main_store,
@ -130,20 +235,133 @@ pub fn apply_documents_addition(
documents_fields_counts_store, documents_fields_counts_store,
postings_lists_store, postings_lists_store,
docs_words_store, docs_words_store,
ranked_map.clone(), documents_ids,
documents_to_insert,
)?; )?;
// 2. insert new document attributes in the database let mut ranked_map = match main_store.ranked_map(writer)? {
for ((id, attr), value) in document_store.into_inner() { Some(ranked_map) => ranked_map,
documents_fields_store.put_document_field(writer, id, attr, &value)?; None => RankedMap::default(),
};
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &schema,
document_store: documents_fields_store,
document_fields_counts: documents_fields_counts_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
} }
// 3. insert new document attributes counts write_documents_addition_index(
for ((id, attr), count) in document_fields_counts { writer,
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?; main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)
}
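The merge rule used above can be summarized with a standalone sketch: fields supplied in the partial document win, and anything missing is copied back from the previously stored document.
use std::collections::HashMap;
use serde_json::Value;

fn merge_partial(mut partial: HashMap<String, Value>, old: HashMap<String, Value>) -> HashMap<String, Value> {
    for (key, value) in old {
        // keep the field from the partial update when both documents define it
        partial.entry(key).or_insert(value);
    }
    partial
}
// merging {"title": "new"} into {"title": "old", "desc": "kept"} yields {"title": "new", "desc": "kept"}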
pub fn reindex_all_documents(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> {
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = RankedMap::default();
// 1. retrieve all documents ids
let mut documents_ids_to_reindex = Vec::new();
for result in documents_fields_counts_store.documents_ids(writer)? {
let document_id = result?;
documents_ids_to_reindex.push(document_id);
} }
// 2. remove the documents posting lists
main_store.put_words_fst(writer, &fst::Set::default())?;
main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |_| 0)?;
postings_lists_store.clear(writer)?;
docs_words_store.clear(writer)?;
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
for documents_ids in documents_ids_to_reindex.chunks(100) {
let stop_words = match main_store.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
let number_of_inserted_documents = documents_ids.len();
let mut indexer = RawIndexer::new(stop_words);
let mut ram_store = HashMap::new();
for document_id in documents_ids {
for result in documents_fields_store.document_fields(writer, *document_id)? {
let (attr, bytes) = result?;
let value: serde_json::Value = serde_json::from_slice(bytes)?;
ram_store.insert((document_id, attr), value);
}
for ((docid, attr), value) in ram_store.drain() {
serialize_value(
writer,
attr,
schema.props(attr),
*docid,
documents_fields_store,
documents_fields_counts_store,
&mut indexer,
&mut ranked_map,
&value,
)?;
}
}
// 4. write the new index in the main store
write_documents_addition_index(
writer,
main_store,
postings_lists_store,
docs_words_store,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
}
Ok(())
}
pub fn write_documents_addition_index(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer,
) -> MResult<()> {
let indexed = indexer.build(); let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory(); let mut delta_words_builder = SetBuilder::memory();
@ -185,10 +403,8 @@ pub fn apply_documents_addition(
}; };
main_store.put_words_fst(writer, &words)?; main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?; main_store.put_ranked_map(writer, ranked_map)?;
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
let inserted_documents_len = document_ids.len() as u64;
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
Ok(()) Ok(())
} }

View File

@ -4,6 +4,7 @@ use fst::{SetBuilder, Streamer};
use meilidb_schema::Schema; use meilidb_schema::Schema;
use sdset::{duo::DifferenceByKey, SetBuf, SetOperation}; use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::serde::extract_document_id; use crate::serde::extract_document_id;
use crate::store; use crate::store;
use crate::update::{next_update_id, Update}; use crate::update::{next_update_id, Update};
@ -12,7 +13,7 @@ use crate::{DocumentId, Error, MResult, RankedMap};
pub struct DocumentsDeletion { pub struct DocumentsDeletion {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
documents: Vec<DocumentId>, documents: Vec<DocumentId>,
} }
@ -20,7 +21,7 @@ impl DocumentsDeletion {
pub fn new( pub fn new(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> DocumentsDeletion { ) -> DocumentsDeletion {
DocumentsDeletion { DocumentsDeletion {
updates_store, updates_store,
@ -49,8 +50,8 @@ impl DocumentsDeletion {
Ok(()) Ok(())
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> { pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_documents_deletion( let update_id = push_documents_deletion(
writer, writer,
self.updates_store, self.updates_store,
@ -68,7 +69,7 @@ impl Extend<DocumentId> for DocumentsDeletion {
} }
pub fn push_documents_deletion( pub fn push_documents_deletion(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
deletion: Vec<DocumentId>, deletion: Vec<DocumentId>,
@ -82,13 +83,12 @@ pub fn push_documents_deletion(
} }
pub fn apply_documents_deletion( pub fn apply_documents_deletion(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
main_store: store::Main, main_store: store::Main,
documents_fields_store: store::DocumentsFields, documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts, documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists, postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords, docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>, deletion: Vec<DocumentId>,
) -> MResult<()> { ) -> MResult<()> {
let idset = SetBuf::from_dirty(deletion); let idset = SetBuf::from_dirty(deletion);
@ -98,6 +98,11 @@ pub fn apply_documents_deletion(
None => return Err(Error::SchemaMissing), None => return Err(Error::SchemaMissing),
}; };
let mut ranked_map = match main_store.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
// collect the ranked attributes according to the schema // collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema let ranked_attrs: Vec<_> = schema
.iter() .iter()
@ -181,7 +186,6 @@ pub fn apply_documents_deletion(
main_store.put_words_fst(writer, &words)?; main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?; main_store.put_ranked_map(writer, &ranked_map)?;
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?; main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
Ok(()) Ok(())

View File

@ -1,70 +1,125 @@
mod clear_all;
mod customs_update; mod customs_update;
mod documents_addition; mod documents_addition;
mod documents_deletion; mod documents_deletion;
mod schema_update; mod schema_update;
mod stop_words_addition;
mod stop_words_deletion;
mod synonyms_addition; mod synonyms_addition;
mod synonyms_deletion; mod synonyms_deletion;
pub use self::clear_all::{apply_clear_all, push_clear_all};
pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition}; pub use self::documents_addition::{
apply_documents_addition, apply_documents_partial_addition, DocumentsAddition,
};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update}; pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition}; pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion}; pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp; use std::cmp;
use std::collections::BTreeMap; use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use heed::Result as ZResult;
use log::debug; use log::debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use zlmdb::Result as ZResult;
use crate::{store, DocumentId, MResult, RankedMap}; use crate::{store, DocumentId, MResult};
use meilidb_schema::Schema; use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update { pub enum Update {
ClearAll,
Schema(Schema), Schema(Schema),
Customs(Vec<u8>), Customs(Vec<u8>),
DocumentsAddition(Vec<serde_json::Value>), DocumentsAddition(Vec<HashMap<String, serde_json::Value>>),
DocumentsPartial(Vec<HashMap<String, serde_json::Value>>),
DocumentsDeletion(Vec<DocumentId>), DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>), SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>), SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
StopWordsAddition(BTreeSet<String>),
StopWordsDeletion(BTreeSet<String>),
}
impl Update {
pub fn update_type(&self) -> UpdateType {
match self {
Update::ClearAll => UpdateType::ClearAll,
Update::Schema(schema) => UpdateType::Schema {
schema: schema.clone(),
},
Update::Customs(_) => UpdateType::Customs,
Update::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
number: addition.len(),
},
Update::DocumentsPartial(addition) => UpdateType::DocumentsPartial {
number: addition.len(),
},
Update::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
number: deletion.len(),
},
Update::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
number: addition.len(),
},
Update::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
number: deletion.len(),
},
Update::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
number: addition.len(),
},
Update::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
number: deletion.len(),
},
}
}
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateType { pub enum UpdateType {
ClearAll,
Schema { schema: Schema }, Schema { schema: Schema },
Customs, Customs,
DocumentsAddition { number: usize }, DocumentsAddition { number: usize },
DocumentsPartial { number: usize },
DocumentsDeletion { number: usize }, DocumentsDeletion { number: usize },
SynonymsAddition { number: usize }, SynonymsAddition { number: usize },
SynonymsDeletion { number: usize }, SynonymsDeletion { number: usize },
StopWordsAddition { number: usize },
StopWordsDeletion { number: usize },
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetailedDuration { pub struct DetailedDuration {
pub main: Duration, pub main: Duration,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateResult { pub struct ProcessedUpdateResult {
pub update_id: u64, pub update_id: u64,
pub update_type: UpdateType, pub update_type: UpdateType,
pub result: Result<(), String>, pub result: Result<(), String>,
pub detailed_duration: DetailedDuration, pub detailed_duration: DetailedDuration,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnqueuedUpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateStatus { pub enum UpdateStatus {
Enqueued, Enqueued(EnqueuedUpdateResult),
Processed(UpdateResult), Processed(ProcessedUpdateResult),
Unknown, Unknown,
} }
pub fn update_status( pub fn update_status(
reader: &zlmdb::RoTxn, reader: &heed::RoTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
update_id: u64, update_id: u64,
@ -72,8 +127,11 @@ pub fn update_status(
match updates_results_store.update_result(reader, update_id)? { match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)), Some(result) => Ok(UpdateStatus::Processed(result)),
None => { None => {
if updates_store.contains(reader, update_id)? { if let Some(update) = updates_store.get(reader, update_id)? {
Ok(UpdateStatus::Enqueued) Ok(UpdateStatus::Enqueued(EnqueuedUpdateResult {
update_id,
update_type: update.update_type(),
}))
} else { } else {
Ok(UpdateStatus::Unknown) Ok(UpdateStatus::Unknown)
} }
@ -82,7 +140,7 @@ pub fn update_status(
} }
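A brief polling sketch built on the richer status type, assuming a read transaction, the index stores and an update_id variable (names illustrative):
let reader = env.read_txn()?;
match update_status(&reader, index.updates, index.updates_results, update_id)? {
    UpdateStatus::Processed(result) => println!("processed: {:?}", result.result),
    UpdateStatus::Enqueued(enqueued) => println!("still enqueued: {:?}", enqueued.update_type),
    UpdateStatus::Unknown => println!("no update with id {}", update_id),
}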
pub fn next_update_id( pub fn next_update_id(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
) -> ZResult<u64> { ) -> ZResult<u64> {
@ -98,25 +156,45 @@ pub fn next_update_id(
Ok(new_update_id) Ok(new_update_id)
} }
pub fn update_task( pub fn update_task<'a, 'b>(
writer: &mut zlmdb::RwTxn, writer: &'a mut heed::RwTxn<'b>,
index: store::Index, index: store::Index,
) -> MResult<Option<UpdateResult>> { update_id: u64,
let (update_id, update) = match index.updates.pop_front(writer)? { update: Update,
Some(value) => value, ) -> MResult<ProcessedUpdateResult> {
None => return Ok(None),
};
debug!("Processing update number {}", update_id); debug!("Processing update number {}", update_id);
let (update_type, result, duration) = match update { let (update_type, result, duration) = match update {
Update::ClearAll => {
let start = Instant::now();
let update_type = UpdateType::ClearAll;
let result = apply_clear_all(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed())
}
Update::Schema(schema) => { Update::Schema(schema) => {
let start = Instant::now(); let start = Instant::now();
let update_type = UpdateType::Schema { let update_type = UpdateType::Schema {
schema: schema.clone(), schema: schema.clone(),
}; };
let result = apply_schema_update(writer, index.main, &schema); let result = apply_schema_update(
writer,
&schema,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
} }
@ -131,11 +209,6 @@ pub fn update_task(
Update::DocumentsAddition(documents) => { Update::DocumentsAddition(documents) => {
let start = Instant::now(); let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsAddition { let update_type = UpdateType::DocumentsAddition {
number: documents.len(), number: documents.len(),
}; };
@ -147,7 +220,25 @@ pub fn update_task(
index.documents_fields_counts, index.documents_fields_counts,
index.postings_lists, index.postings_lists,
index.docs_words, index.docs_words,
ranked_map, documents,
);
(update_type, result, start.elapsed())
}
Update::DocumentsPartial(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsPartial {
number: documents.len(),
};
let result = apply_documents_partial_addition(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
documents, documents,
); );
@ -156,11 +247,6 @@ pub fn update_task(
Update::DocumentsDeletion(documents) => { Update::DocumentsDeletion(documents) => {
let start = Instant::now(); let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsDeletion { let update_type = UpdateType::DocumentsDeletion {
number: documents.len(), number: documents.len(),
}; };
@ -172,7 +258,6 @@ pub fn update_task(
index.documents_fields_counts, index.documents_fields_counts,
index.postings_lists, index.postings_lists,
index.docs_words, index.docs_words,
ranked_map,
documents, documents,
); );
@ -198,6 +283,37 @@ pub fn update_task(
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms); let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
Update::StopWordsAddition(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsAddition {
number: stop_words.len(),
};
let result =
apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
(update_type, result, start.elapsed())
}
Update::StopWordsDeletion(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsDeletion {
number: stop_words.len(),
};
let result = apply_stop_words_deletion(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
stop_words,
);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
} }
}; };
@ -208,16 +324,12 @@ pub fn update_task(
); );
let detailed_duration = DetailedDuration { main: duration }; let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult { let status = ProcessedUpdateResult {
update_id, update_id,
update_type, update_type,
result: result.map_err(|e| e.to_string()), result: result.map_err(|e| e.to_string()),
detailed_duration, detailed_duration,
}; };
index Ok(status)
.updates_results
.put_update_result(writer, update_id, &status)?;
Ok(Some(status))
} }

View File

@ -1,23 +1,67 @@
use meilidb_schema::{Diff, Schema};
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update}; use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult}; use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema;
pub fn apply_schema_update( pub fn apply_schema_update(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
main_store: store::Main,
new_schema: &Schema, new_schema: &Schema,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
) -> MResult<()> { ) -> MResult<()> {
if main_store.schema(writer)?.is_some() { use UnsupportedOperation::{
return Err(UnsupportedOperation::SchemaAlreadyExists.into()); CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
CannotReorderSchemaAttribute, CannotUpdateSchemaIdentifier,
};
let mut need_full_reindexing = false;
if let Some(old_schema) = main_store.schema(writer)? {
for diff in meilidb_schema::diff(&old_schema, new_schema) {
match diff {
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
Diff::AttrMove { .. } => return Err(CannotReorderSchemaAttribute.into()),
Diff::AttrPropsChange { old, new, .. } => {
if new.indexed != old.indexed {
need_full_reindexing = true;
}
if new.ranked != old.ranked {
need_full_reindexing = true;
}
}
Diff::NewAttr { pos, .. } => {
// new attribute not at the end of the schema
if pos < old_schema.number_of_attributes() {
return Err(CanOnlyIntroduceNewSchemaAttributesAtEnd.into());
}
}
Diff::RemovedAttr { .. } => return Err(CannotRemoveSchemaAttribute.into()),
}
}
} }
main_store main_store.put_schema(writer, new_schema)?;
.put_schema(writer, new_schema)
.map_err(Into::into) if need_full_reindexing {
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?
}
Ok(())
} }
pub fn push_schema_update( pub fn push_schema_update(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
schema: Schema, schema: Schema,

View File

@ -0,0 +1,117 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
stop_words: BTreeSet<String>,
}
impl StopWordsAddition {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> StopWordsAddition {
StopWordsAddition {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_stop_words_addition(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_addition(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsAddition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
addition: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in addition {
stop_words_builder.insert(&word).unwrap();
// we remove every posting list associated with a new stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// we also need to remove all the stop words from the main fst
if let Some(word_fst) = main_store.words_fst(writer)? {
let op = OpBuilder::new()
.add(&word_fst)
.add(&delta_stop_words)
.difference();
let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op).unwrap();
let word_fst = word_fst_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_words_fst(writer, &word_fst)?;
}
// now we add all of these stop words to the stop words fst in the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.r#union();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
Ok(())
}
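A short usage sketch, assuming an index handle and environment with illustrative names: stop words are normalized on insertion, and their posting lists are dropped when the update is applied.
let mut addition = StopWordsAddition::new(
    index.updates,
    index.updates_results,
    index.updates_notifier.clone(),
);
addition.add_stop_word("the");
addition.add_stop_word("of");
let mut writer = env.write_txn()?;
let update_id = addition.finalize(&mut writer)?;
writer.commit()?;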

View File

@ -0,0 +1,113 @@
use std::collections::BTreeSet;
use fst::{set::OpBuilder, SetBuilder};
use crate::automaton::normalize_str;
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::update::documents_addition::reindex_all_documents;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct StopWordsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
stop_words: BTreeSet<String>,
}
impl StopWordsDeletion {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> StopWordsDeletion {
StopWordsDeletion {
updates_store,
updates_results_store,
updates_notifier,
stop_words: BTreeSet::new(),
}
}
pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
let stop_word = normalize_str(stop_word.as_ref());
self.stop_words.insert(stop_word);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_stop_words_deletion(
writer,
self.updates_store,
self.updates_results_store,
self.stop_words,
)?;
Ok(update_id)
}
}
pub fn push_stop_words_deletion(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeSet<String>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsDeletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_stop_words_deletion(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
deletion: BTreeSet<String>,
) -> MResult<()> {
let mut stop_words_builder = SetBuilder::memory();
for word in deletion {
stop_words_builder.insert(&word).unwrap();
}
// create the new delta stop words fst
let delta_stop_words = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
// now we delete all of these stop words from the main store
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
let op = OpBuilder::new()
.add(&stop_words_fst)
.add(&delta_stop_words)
.difference();
let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap();
let stop_words_fst = stop_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
// now that we have set up the stop words
// let's reindex everything...
reindex_all_documents(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
)?;
Ok(())
}

View File

@ -4,13 +4,14 @@ use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf; use sdset::SetBuf;
use crate::automaton::normalize_str; use crate::automaton::normalize_str;
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::update::{next_update_id, Update}; use crate::update::{next_update_id, Update};
use crate::{store, MResult}; use crate::{store, MResult};
pub struct SynonymsAddition { pub struct SynonymsAddition {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
synonyms: BTreeMap<String, Vec<String>>, synonyms: BTreeMap<String, Vec<String>>,
} }
@ -18,7 +19,7 @@ impl SynonymsAddition {
pub fn new( pub fn new(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> SynonymsAddition { ) -> SynonymsAddition {
SynonymsAddition { SynonymsAddition {
updates_store, updates_store,
@ -42,8 +43,8 @@ impl SynonymsAddition {
.extend(alternatives); .extend(alternatives);
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> { pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_synonyms_addition( let update_id = push_synonyms_addition(
writer, writer,
self.updates_store, self.updates_store,
@ -55,7 +56,7 @@ impl SynonymsAddition {
} }
pub fn push_synonyms_addition( pub fn push_synonyms_addition(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>, addition: BTreeMap<String, Vec<String>>,
@ -69,7 +70,7 @@ pub fn push_synonyms_addition(
} }
pub fn apply_synonyms_addition( pub fn apply_synonyms_addition(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
main_store: store::Main, main_store: store::Main,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>, addition: BTreeMap<String, Vec<String>>,

View File

@ -5,13 +5,14 @@ use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf; use sdset::SetBuf;
use crate::automaton::normalize_str; use crate::automaton::normalize_str;
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::update::{next_update_id, Update}; use crate::update::{next_update_id, Update};
use crate::{store, MResult}; use crate::{store, MResult};
pub struct SynonymsDeletion { pub struct SynonymsDeletion {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
synonyms: BTreeMap<String, Option<Vec<String>>>, synonyms: BTreeMap<String, Option<Vec<String>>>,
} }
@ -19,7 +20,7 @@ impl SynonymsDeletion {
pub fn new( pub fn new(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: UpdateEventsEmitter,
) -> SynonymsDeletion { ) -> SynonymsDeletion {
SynonymsDeletion { SynonymsDeletion {
updates_store, updates_store,
@ -49,8 +50,8 @@ impl SynonymsDeletion {
} }
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> { pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_synonyms_deletion( let update_id = push_synonyms_deletion(
writer, writer,
self.updates_store, self.updates_store,
@ -62,7 +63,7 @@ impl SynonymsDeletion {
} }
pub fn push_synonyms_deletion( pub fn push_synonyms_deletion(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>, deletion: BTreeMap<String, Option<Vec<String>>>,
@ -76,7 +77,7 @@ pub fn push_synonyms_deletion(
} }
pub fn apply_synonyms_deletion( pub fn apply_synonyms_deletion(
writer: &mut zlmdb::RwTxn, writer: &mut heed::RwTxn,
main_store: store::Main, main_store: store::Main,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>, deletion: BTreeMap<String, Option<Vec<String>>>,

55
meilidb-http/Cargo.toml Normal file
View File

@ -0,0 +1,55 @@
[package]
name = "meilidb-http"
version = "0.1.1"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",
]
edition = "2018"
[dependencies]
bincode = "1.2.0"
chrono = { version = "0.4.9", features = ["serde"] }
crossbeam-channel = "0.3.9"
envconfig = "0.5.1"
envconfig_derive = "0.5.1"
heed = "0.5.0"
http = "0.1.19"
indexmap = { version = "1.3.0", features = ["serde-1"] }
jemallocator = "0.3.2"
log = "0.4.8"
main_error = "0.1.0"
meilidb-core = { path = "../meilidb-core", version = "0.6.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
pretty-bytes = "0.2.2"
rand = "0.7.2"
rayon = "1.2.0"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = { version = "1.0.41", features = ["preserve_order"] }
structopt = "0.3.3"
sysinfo = "0.9.5"
walkdir = "2.2.9"
[dependencies.async-compression]
default-features = false
features = ["stream", "gzip", "zlib", "brotli", "zstd"]
version = "0.1.0-alpha.7"
[dependencies.tide]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-log]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-slog]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[dependencies.tide-compression]
git = "https://github.com/rustasync/tide"
rev = "e77709370bb24cf776fe6da902467c35131535b1"
[build-dependencies]
vergen = "3.0.4"

10
meilidb-http/build.rs Normal file
View File

@ -0,0 +1,10 @@
use vergen::{generate_cargo_keys, ConstantsFlags};
fn main() {
// Set up the flags, toggling off the 'SEMVER_FROM_CARGO_PKG' flag
let mut flags = ConstantsFlags::all();
flags.toggle(ConstantsFlags::SEMVER_FROM_CARGO_PKG);
// Generate the 'cargo:' key output
generate_cargo_keys(flags).expect("Unable to generate the cargo keys!");
}

190
meilidb-http/src/data.rs Normal file
View File

@ -0,0 +1,190 @@
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use chrono::{DateTime, Utc};
use heed::types::{SerdeBincode, Str};
use log::*;
use meilidb_core::{Database, MResult};
use sysinfo::Pid;
use crate::option::Opt;
use crate::routes::index::index_update_callback;
pub type FreqsMap = HashMap<String, usize>;
type SerdeFreqsMap = SerdeBincode<FreqsMap>;
type SerdeDatetime = SerdeBincode<DateTime<Utc>>;
#[derive(Clone)]
pub struct Data {
inner: Arc<DataInner>,
}
impl Deref for Data {
type Target = DataInner;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
#[derive(Clone)]
pub struct DataInner {
pub db: Arc<Database>,
pub db_path: String,
pub admin_token: Option<String>,
pub server_pid: Pid,
pub accept_updates: Arc<AtomicBool>,
}
impl DataInner {
pub fn is_indexing(&self, reader: &heed::RoTxn, index: &str) -> MResult<Option<bool>> {
match self.db.open_index(&index) {
Some(index) => index.current_update_id(&reader).map(|u| Some(u.is_some())),
None => Ok(None),
}
}
pub fn last_update(
&self,
reader: &heed::RoTxn,
index_name: &str,
) -> MResult<Option<DateTime<Utc>>> {
let key = format!("last-update-{}", index_name);
match self
.db
.common_store()
.get::<Str, SerdeDatetime>(&reader, &key)?
{
Some(datetime) => Ok(Some(datetime)),
None => Ok(None),
}
}
pub fn set_last_update(&self, writer: &mut heed::RwTxn, index_name: &str) -> MResult<()> {
let key = format!("last-update-{}", index_name);
self.db
.common_store()
.put::<Str, SerdeDatetime>(writer, &key, &Utc::now())
.map_err(Into::into)
}
pub fn last_backup(&self, reader: &heed::RoTxn) -> MResult<Option<DateTime<Utc>>> {
match self
.db
.common_store()
.get::<Str, SerdeDatetime>(&reader, "last-backup")?
{
Some(datetime) => Ok(Some(datetime)),
None => Ok(None),
}
}
pub fn set_last_backup(&self, writer: &mut heed::RwTxn) -> MResult<()> {
self.db
.common_store()
.put::<Str, SerdeDatetime>(writer, "last-backup", &Utc::now())?;
Ok(())
}
pub fn fields_frequency(
&self,
reader: &heed::RoTxn,
index_name: &str,
) -> MResult<Option<FreqsMap>> {
let key = format!("fields-frequency-{}", index_name);
match self
.db
.common_store()
.get::<Str, SerdeFreqsMap>(&reader, &key)?
{
Some(freqs) => Ok(Some(freqs)),
None => Ok(None),
}
}
pub fn compute_stats(&self, writer: &mut heed::RwTxn, index_name: &str) -> MResult<()> {
let index = match self.db.open_index(&index_name) {
Some(index) => index,
None => {
error!("Impossible to retrieve index {}", index_name);
return Ok(());
}
};
let schema = match index.main.schema(&writer)? {
Some(schema) => schema,
None => return Ok(()),
};
let all_documents_fields = index
.documents_fields_counts
.all_documents_fields_counts(&writer)?;
// count fields frequencies
let mut fields_frequency = HashMap::<_, usize>::new();
for result in all_documents_fields {
let (_, attr, _) = result?;
*fields_frequency.entry(attr).or_default() += 1;
}
// convert attributes to their names
let frequency: HashMap<_, _> = fields_frequency
.into_iter()
.map(|(a, c)| (schema.attribute_name(a).to_owned(), c))
.collect();
let key = format!("fields-frequency-{}", index_name);
self.db
.common_store()
.put::<Str, SerdeFreqsMap>(writer, &key, &frequency)?;
Ok(())
}
pub fn stop_accept_updates(&self) {
self.accept_updates.store(false, Ordering::Relaxed);
}
pub fn accept_updates(&self) -> bool {
self.accept_updates.load(Ordering::Relaxed)
}
}
impl Data {
pub fn new(opt: Opt) -> Data {
let db_path = opt.database_path.clone();
let admin_token = opt.admin_token.clone();
let server_pid = sysinfo::get_current_pid().unwrap();
let db = Arc::new(Database::open_or_create(opt.database_path.clone()).unwrap());
let accept_updates = Arc::new(AtomicBool::new(true));
let inner_data = DataInner {
db: db.clone(),
db_path,
admin_token,
server_pid,
accept_updates,
};
let data = Data {
inner: Arc::new(inner_data),
};
for index_name in db.indexes_names().unwrap() {
let callback_context = data.clone();
let callback_name = index_name.clone();
db.set_update_callback(
index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
}
data
}
}

meilidb-http/src/error.rs

@ -0,0 +1,117 @@
use std::fmt::Display;
use http::status::StatusCode;
use log::{error, warn};
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::Response;
pub type SResult<T> = Result<T, ResponseError>;
pub enum ResponseError {
Internal(String),
BadRequest(String),
InvalidToken(String),
NotFound(String),
IndexNotFound(String),
DocumentNotFound(String),
MissingHeader(String),
BadParameter(String, String),
CreateIndex(String),
Maintenance,
}
impl ResponseError {
pub fn internal(message: impl Display) -> ResponseError {
ResponseError::Internal(message.to_string())
}
pub fn bad_request(message: impl Display) -> ResponseError {
ResponseError::BadRequest(message.to_string())
}
pub fn invalid_token(message: impl Display) -> ResponseError {
ResponseError::InvalidToken(message.to_string())
}
pub fn not_found(message: impl Display) -> ResponseError {
ResponseError::NotFound(message.to_string())
}
pub fn index_not_found(message: impl Display) -> ResponseError {
ResponseError::IndexNotFound(message.to_string())
}
pub fn document_not_found(message: impl Display) -> ResponseError {
ResponseError::DocumentNotFound(message.to_string())
}
pub fn missing_header(message: impl Display) -> ResponseError {
ResponseError::MissingHeader(message.to_string())
}
pub fn bad_parameter(name: impl Display, message: impl Display) -> ResponseError {
ResponseError::BadParameter(name.to_string(), message.to_string())
}
pub fn create_index(message: impl Display) -> ResponseError {
ResponseError::CreateIndex(message.to_string())
}
}
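// Maps each error variant to an HTTP status code and a JSON body of the form { "message": "..." };
// internal errors and bad requests are additionally logged.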
impl IntoResponse for ResponseError {
fn into_response(self) -> Response {
match self {
ResponseError::Internal(err) => {
error!("internal server error: {}", err);
error(
String::from("Internal server error"),
StatusCode::INTERNAL_SERVER_ERROR,
)
}
ResponseError::BadRequest(err) => {
warn!("bad request: {}", err);
error(err, StatusCode::BAD_REQUEST)
}
ResponseError::InvalidToken(err) => {
error(format!("Invalid Token: {}", err), StatusCode::FORBIDDEN)
}
ResponseError::NotFound(err) => error(err, StatusCode::NOT_FOUND),
ResponseError::IndexNotFound(index) => {
error(format!("Index {} not found", index), StatusCode::NOT_FOUND)
}
ResponseError::DocumentNotFound(id) => error(
format!("Document with id {} not found", id),
StatusCode::NOT_FOUND,
),
ResponseError::MissingHeader(header) => error(
format!("Header {} is missing", header),
StatusCode::UNAUTHORIZED,
),
ResponseError::BadParameter(param, e) => error(
format!("Url parameter {} error: {}", param, e),
StatusCode::BAD_REQUEST,
),
ResponseError::CreateIndex(err) => error(
format!("Impossible to create index; {}", err),
StatusCode::BAD_REQUEST,
),
ResponseError::Maintenance => error(
String::from("Server is in maintenance, please try again later"),
StatusCode::SERVICE_UNAVAILABLE,
),
}
}
}
#[derive(Serialize, Deserialize)]
struct ErrorMessage {
message: String,
}
fn error(message: String, status: StatusCode) -> Response {
let message = ErrorMessage { message };
tide::response::json(message)
.with_status(status)
.into_response()
}


@ -0,0 +1,568 @@
use crate::routes::setting::{RankingOrdering, SettingBody};
use indexmap::IndexMap;
use log::*;
use meilidb_core::criterion::*;
use meilidb_core::Highlight;
use meilidb_core::{Index, RankedMap};
use meilidb_schema::{Schema, SchemaAttr};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::convert::From;
use std::error;
use std::fmt;
use std::time::{Duration, Instant};
#[derive(Debug)]
pub enum Error {
SearchDocuments(String),
RetrieveDocument(u64, String),
DocumentNotFound(u64),
CropFieldWrongType(String),
AttributeNotFoundOnDocument(String),
AttributeNotFoundOnSchema(String),
MissingFilterValue,
UnknownFilteredAttribute,
Internal(String),
}
impl error::Error for Error {}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use Error::*;
match self {
SearchDocuments(err) => write!(f, "impossible to search documents; {}", err),
RetrieveDocument(id, err) => write!(
f,
"impossible to retrieve the document with id: {}; {}",
id, err
),
DocumentNotFound(id) => write!(f, "document {} not found", id),
CropFieldWrongType(field) => {
write!(f, "the field {} cannot be cropped it's not a string", field)
}
AttributeNotFoundOnDocument(field) => {
write!(f, "field {} is not found on document", field)
}
AttributeNotFoundOnSchema(field) => write!(f, "field {} is not found on schema", field),
MissingFilterValue => f.write_str("a filter doesn't have a value to compare it with"),
UnknownFilteredAttribute => {
f.write_str("a filter is specifying an unknown schema attribute")
}
Internal(err) => write!(f, "internal error; {}", err),
}
}
}
impl From<meilidb_core::Error> for Error {
fn from(error: meilidb_core::Error) -> Self {
Error::Internal(error.to_string())
}
}
pub trait IndexSearchExt {
fn new_search(&self, query: String) -> SearchBuilder;
}
impl IndexSearchExt for Index {
fn new_search(&self, query: String) -> SearchBuilder {
SearchBuilder {
index: self,
query,
offset: 0,
limit: 20,
attributes_to_crop: None,
attributes_to_retrieve: None,
attributes_to_search_in: None,
attributes_to_highlight: None,
filters: None,
timeout: Duration::from_millis(30),
matches: false,
}
}
}
pub struct SearchBuilder<'a> {
index: &'a Index,
query: String,
offset: usize,
limit: usize,
attributes_to_crop: Option<HashMap<String, usize>>,
attributes_to_retrieve: Option<HashSet<String>>,
attributes_to_search_in: Option<HashSet<String>>,
attributes_to_highlight: Option<HashSet<String>>,
filters: Option<String>,
timeout: Duration,
matches: bool,
}
impl<'a> SearchBuilder<'a> {
pub fn offset(&mut self, value: usize) -> &SearchBuilder {
self.offset = value;
self
}
pub fn limit(&mut self, value: usize) -> &SearchBuilder {
self.limit = value;
self
}
pub fn attributes_to_crop(&mut self, value: HashMap<String, usize>) -> &SearchBuilder {
self.attributes_to_crop = Some(value);
self
}
pub fn attributes_to_retrieve(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_retrieve = Some(value);
self
}
pub fn add_retrievable_field(&mut self, value: String) -> &SearchBuilder {
let attributes_to_retrieve = self.attributes_to_retrieve.get_or_insert(HashSet::new());
attributes_to_retrieve.insert(value);
self
}
pub fn attributes_to_search_in(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_search_in = Some(value);
self
}
pub fn attributes_to_highlight(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_highlight = Some(value);
self
}
pub fn filters(&mut self, value: String) -> &SearchBuilder {
self.filters = Some(value);
self
}
pub fn timeout(&mut self, value: Duration) -> &SearchBuilder {
self.timeout = value;
self
}
pub fn get_matches(&mut self) -> &SearchBuilder {
self.matches = true;
self
}
pub fn search(&self, reader: &heed::RoTxn) -> Result<SearchResult, Error> {
let schema = self.index.main.schema(reader);
let schema = schema.map_err(|e| Error::Internal(e.to_string()))?;
let schema = match schema {
Some(schema) => schema,
None => return Err(Error::Internal(String::from("missing schema"))),
};
let ranked_map = self.index.main.ranked_map(reader);
let ranked_map = ranked_map.map_err(|e| Error::Internal(e.to_string()))?;
let ranked_map = ranked_map.unwrap_or_default();
let start = Instant::now();
// Change criteria
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
Some(criteria) => self.index.query_builder_with_criteria(criteria),
None => self.index.query_builder(),
};
// Filter searchable fields
if let Some(fields) = &self.attributes_to_search_in {
for attribute in fields.iter().filter_map(|f| schema.attribute(f)) {
query_builder.add_searchable_attribute(attribute.0);
}
}
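// A filter has the form `attribute:value`; the value is trimmed, lowercased, and compared against
// the document's string, boolean, or array attribute.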
if let Some(filters) = &self.filters {
let mut split = filters.split(':');
match (split.next(), split.next()) {
(Some(_), None) | (Some(_), Some("")) => return Err(Error::MissingFilterValue),
(Some(attr), Some(value)) => {
let ref_reader = reader;
let ref_index = &self.index;
let value = value.trim().to_lowercase();
let attr = match schema.attribute(attr) {
Some(attr) => attr,
None => return Err(Error::UnknownFilteredAttribute),
};
query_builder.with_filter(move |id| {
let attr = attr;
let index = ref_index;
let reader = ref_reader;
match index.document_attribute::<Value>(reader, id, attr) {
Ok(Some(Value::String(s))) => s.to_lowercase() == value,
Ok(Some(Value::Bool(b))) => {
(value == "true" && b) || (value == "false" && !b)
}
Ok(Some(Value::Array(a))) => {
a.into_iter().any(|s| s.as_str() == Some(&value))
}
_ => false,
}
});
}
(_, _) => (),
}
}
query_builder.with_fetch_timeout(self.timeout);
let docs =
query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
let mut hits = Vec::with_capacity(self.limit);
for doc in docs.map_err(|e| Error::SearchDocuments(e.to_string()))? {
// retrieve the content of document in kv store
let mut fields: Option<HashSet<&str>> = None;
if let Some(attributes_to_retrieve) = &self.attributes_to_retrieve {
let mut set = HashSet::new();
for field in attributes_to_retrieve {
set.insert(field.as_str());
}
fields = Some(set);
}
let mut document: IndexMap<String, Value> = self
.index
.document(reader, fields.as_ref(), doc.id)
.map_err(|e| Error::RetrieveDocument(doc.id.0, e.to_string()))?
.ok_or(Error::DocumentNotFound(doc.id.0))?;
let mut matches = doc.highlights.clone();
// Crops fields if needed
if let Some(fields) = self.attributes_to_crop.clone() {
for (field, length) in fields {
let _ = crop_document(&mut document, &mut matches, &schema, &field, length);
}
}
// Transform to readable matches
let matches = calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema);
if !self.matches {
if let Some(attributes_to_highlight) = self.attributes_to_highlight.clone() {
let highlights = calculate_highlights(
document.clone(),
matches.clone(),
attributes_to_highlight,
);
for (key, value) in highlights {
if let Some(content) = document.get_mut(&key) {
*content = value;
}
}
}
}
let matches_info = if self.matches { Some(matches) } else { None };
let hit = SearchHit {
hit: document,
matches_info,
};
hits.push(hit);
}
let time_ms = start.elapsed().as_millis() as usize;
let results = SearchResult {
hits,
offset: self.offset,
limit: self.limit,
processing_time_ms: time_ms,
query: self.query.to_string(),
};
Ok(results)
}
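// Builds the ranking criteria from the settings stored in the index customs: either following the
// user-supplied ranking order, or using the default criteria with any custom Asc/Dsc attribute
// sorts appended, always ending with the DocumentId tie-breaker. Returns None when no custom
// ranking rules are configured.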
pub fn get_criteria(
&self,
reader: &heed::RoTxn,
ranked_map: &'a RankedMap,
schema: &Schema,
) -> Result<Option<Criteria<'a>>, Error> {
let current_settings = match self.index.main.customs(reader).unwrap() {
Some(bytes) => bincode::deserialize(bytes).unwrap(),
None => SettingBody::default(),
};
let ranking_rules = &current_settings.ranking_rules;
let ranking_order = &current_settings.ranking_order;
if let Some(ranking_rules) = ranking_rules {
let mut builder = CriteriaBuilder::with_capacity(7 + ranking_rules.len());
if let Some(ranking_rules_order) = ranking_order {
for rule in ranking_rules_order {
match rule.as_str() {
"_sum_of_typos" => builder.push(SumOfTypos),
"_number_of_words" => builder.push(NumberOfWords),
"_word_proximity" => builder.push(WordsProximity),
"_sum_of_words_attribute" => builder.push(SumOfWordsAttribute),
"_sum_of_words_position" => builder.push(SumOfWordsPosition),
"_exact" => builder.push(Exact),
_ => {
let order = match ranking_rules.get(rule.as_str()) {
Some(o) => o,
None => continue,
};
let custom_ranking = match order {
RankingOrdering::Asc => {
SortByAttr::lower_is_better(&ranked_map, &schema, &rule)
.unwrap()
}
RankingOrdering::Dsc => {
SortByAttr::higher_is_better(&ranked_map, &schema, &rule)
.unwrap()
}
};
builder.push(custom_ranking);
}
}
}
builder.push(DocumentId);
return Ok(Some(builder.build()));
} else {
builder.push(SumOfTypos);
builder.push(NumberOfWords);
builder.push(WordsProximity);
builder.push(SumOfWordsAttribute);
builder.push(SumOfWordsPosition);
builder.push(Exact);
for (rule, order) in ranking_rules.iter() {
let custom_ranking = match order {
RankingOrdering::Asc => {
SortByAttr::lower_is_better(&ranked_map, &schema, &rule).unwrap()
}
RankingOrdering::Dsc => {
SortByAttr::higher_is_better(&ranked_map, &schema, &rule).unwrap()
}
};
builder.push(custom_ranking);
}
builder.push(DocumentId);
return Ok(Some(builder.build()));
}
}
Ok(None)
}
}
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Serialize, Deserialize)]
pub struct MatchPosition {
pub start: usize,
pub length: usize,
}
impl Ord for MatchPosition {
fn cmp(&self, other: &Self) -> Ordering {
match self.start.cmp(&other.start) {
Ordering::Equal => self.length.cmp(&other.length),
_ => self.start.cmp(&other.start),
}
}
}
pub type HighlightInfos = HashMap<String, Value>;
pub type MatchesInfos = HashMap<String, Vec<MatchPosition>>;
// pub type RankingInfos = HashMap<String, u64>;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchHit {
#[serde(flatten)]
pub hit: IndexMap<String, Value>,
#[serde(rename = "_matchesInfo", skip_serializing_if = "Option::is_none")]
pub matches_info: Option<MatchesInfos>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
pub offset: usize,
pub limit: usize,
pub processing_time_ms: usize,
pub query: String,
// pub parsed_query: String,
// pub params: Option<String>,
}
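// Keeps a window of `2 * context` characters starting `context` characters before the first match
// and rebases the surviving highlight positions onto the cropped string.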
fn crop_text(
text: &str,
matches: impl IntoIterator<Item = Highlight>,
context: usize,
) -> (String, Vec<Highlight>) {
let mut matches = matches.into_iter().peekable();
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let matches = matches
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
.map(|match_| Highlight {
char_index: match_.char_index - start as u16,
..match_
})
.collect();
(text, matches)
}
fn crop_document(
document: &mut IndexMap<String, Value>,
matches: &mut Vec<Highlight>,
schema: &Schema,
field: &str,
length: usize,
) -> Result<(), Error> {
matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let attribute = schema
.attribute(field)
.ok_or(Error::AttributeNotFoundOnSchema(field.to_string()))?;
let selected_matches = matches
.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attribute)
.cloned();
let original_text = match document.get(field) {
Some(Value::String(text)) => text,
Some(_) => return Err(Error::CropFieldWrongType(field.to_string())),
None => return Err(Error::AttributeNotFoundOnDocument(field.to_string())),
};
let (cropped_text, cropped_matches) = crop_text(&original_text, selected_matches, length);
document.insert(
field.to_string(),
serde_json::value::Value::String(cropped_text),
);
matches.retain(|m| SchemaAttr::new(m.attribute) != attribute);
matches.extend_from_slice(&cropped_matches);
Ok(())
}
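// Groups the raw highlight positions by attribute name, optionally restricted to the retrieved
// attributes, and returns them sorted and deduplicated.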
fn calculate_matches(
matches: Vec<Highlight>,
attributes_to_retrieve: Option<HashSet<String>>,
schema: &Schema,
) -> MatchesInfos {
let mut matches_result: HashMap<String, Vec<MatchPosition>> = HashMap::new();
for m in matches.iter() {
let attribute = schema
.attribute_name(SchemaAttr::new(m.attribute))
.to_string();
if let Some(attributes_to_retrieve) = attributes_to_retrieve.clone() {
if !attributes_to_retrieve.contains(attribute.as_str()) {
continue;
}
};
if let Some(pos) = matches_result.get_mut(&attribute) {
pos.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
} else {
let mut positions = Vec::new();
positions.push(MatchPosition {
start: m.char_index as usize,
length: m.char_length as usize,
});
matches_result.insert(attribute, positions);
}
}
for (_, val) in matches_result.iter_mut() {
val.sort_unstable();
val.dedup();
}
matches_result
}
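// Wraps every matched span of the selected string attributes in `<em>`/`</em>` tags, working on
// char indices so that multi-byte characters are handled correctly.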
fn calculate_highlights(
document: IndexMap<String, Value>,
matches: MatchesInfos,
attributes_to_highlight: HashSet<String>,
) -> HighlightInfos {
let mut highlight_result: HashMap<String, Value> = HashMap::new();
for (attribute, matches) in matches.iter() {
if attributes_to_highlight.contains("*") || attributes_to_highlight.contains(attribute) {
if let Some(Value::String(value)) = document.get(attribute) {
let value: Vec<_> = value.chars().collect();
let mut highlighted_value = String::new();
let mut index = 0;
for m in matches {
if m.start >= index {
let before = value.get(index..m.start);
let highlighted = value.get(m.start..(m.start + m.length));
if let (Some(before), Some(highlighted)) = (before, highlighted) {
highlighted_value.extend(before);
highlighted_value.push_str("<em>");
highlighted_value.extend(highlighted);
highlighted_value.push_str("</em>");
index = m.start + m.length;
} else {
error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
}
}
}
highlighted_value.extend(value[index..].iter());
highlight_result.insert(attribute.to_string(), Value::String(highlighted_value));
};
}
}
highlight_result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn calculate_highlights() {
let data = r#"{
"title": "Fondation (Isaac ASIMOV)",
"description": "En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."
}"#;
let document: IndexMap<String, Value> = serde_json::from_str(data).unwrap();
let mut attributes_to_highlight = HashSet::new();
attributes_to_highlight.insert("*".to_string());
let mut matches: HashMap<String, Vec<MatchPosition>> = HashMap::new();
let mut m = Vec::new();
m.push(MatchPosition {
start: 0,
length: 9,
});
matches.insert("title".to_string(), m);
let mut m = Vec::new();
m.push(MatchPosition {
start: 510,
length: 9,
});
matches.insert("description".to_string(), m);
let result = super::calculate_highlights(document, matches, attributes_to_highlight);
let mut result_expected = HashMap::new();
result_expected.insert(
"title".to_string(),
Value::String("<em>Fondation</em> (Isaac ASIMOV)".to_string()),
);
result_expected.insert("description".to_string(), Value::String("En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la <em>Fondation</em>.".to_string()));
assert_eq!(result, result_expected);
}
}


@ -0,0 +1,2 @@
pub mod meilidb;
pub mod tide;


@ -0,0 +1,118 @@
use crate::error::{ResponseError, SResult};
use crate::models::token::*;
use crate::Data;
use chrono::Utc;
use heed::types::{SerdeBincode, Str};
use meilidb_core::Index;
use serde_json::Value;
use tide::Context;
pub trait ContextExt {
fn is_allowed(&self, acl: ACL) -> SResult<()>;
fn header(&self, name: &str) -> Result<String, ResponseError>;
fn url_param(&self, name: &str) -> Result<String, ResponseError>;
fn index(&self) -> Result<Index, ResponseError>;
fn identifier(&self) -> Result<String, ResponseError>;
}
impl ContextExt for Context<Data> {
fn is_allowed(&self, acl: ACL) -> SResult<()> {
let admin_token = match &self.state().admin_token {
Some(admin_token) => admin_token,
None => return Ok(()),
};
let user_api_key = self.header("X-Meili-API-Key")?;
if user_api_key == *admin_token {
return Ok(());
}
let request_index: Option<String> = None; //self.param::<String>("index").ok();
let db = &self.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, user_api_key);
let token_config = db
.common_store()
.get::<Str, SerdeBincode<Token>>(&reader, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
if token_config.revoked {
return Err(ResponseError::invalid_token("token revoked"));
}
if let Some(index) = request_index {
if !token_config
.indexes
.iter()
.any(|r| match_wildcard(&r, &index))
{
return Err(ResponseError::invalid_token(
"token is not allowed to access to this index",
));
}
}
if token_config.expires_at < Utc::now() {
return Err(ResponseError::invalid_token("token expired"));
}
if token_config.acl.contains(&ACL::All) {
return Ok(());
}
if !token_config.acl.contains(&acl) {
return Err(ResponseError::invalid_token("token do not have this ACL"));
}
Ok(())
}
fn header(&self, name: &str) -> Result<String, ResponseError> {
let header = self
.headers()
.get(name)
.ok_or(ResponseError::missing_header(name))?
.to_str()
.map_err(|_| ResponseError::missing_header("X-Meili-API-Key"))?
.to_string();
Ok(header)
}
fn url_param(&self, name: &str) -> Result<String, ResponseError> {
let param = self
.param::<String>(name)
.map_err(|e| ResponseError::bad_parameter(name, e))?;
Ok(param)
}
fn index(&self) -> Result<Index, ResponseError> {
let index_name = self.url_param("index")?;
let index = self
.state()
.db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(index_name))?;
Ok(index)
}
fn identifier(&self) -> Result<String, ResponseError> {
let name = self
.param::<Value>("identifier")
.as_ref()
.map(meilidb_core::serde::value_to_string)
.map_err(|e| ResponseError::bad_parameter("identifier", e))?
.ok_or(ResponseError::bad_parameter(
"identifier",
"missing parameter",
))?;
Ok(name)
}
}

meilidb-http/src/lib.rs

@ -0,0 +1,11 @@
#[macro_use]
extern crate envconfig_derive;
pub mod data;
pub mod error;
pub mod helpers;
pub mod models;
pub mod option;
pub mod routes;
use self::data::Data;

meilidb-http/src/main.rs

@ -0,0 +1,36 @@
use http::header::HeaderValue;
use log::info;
use main_error::MainError;
use tide::middleware::{CorsMiddleware, CorsOrigin};
use tide_log::RequestLogger;
use meilidb_http::data::Data;
use meilidb_http::option::Opt;
use meilidb_http::routes;
#[cfg(not(target_os = "macos"))]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
pub fn main() -> Result<(), MainError> {
let opt = Opt::new();
let data = Data::new(opt.clone());
let mut app = tide::App::with_state(data);
app.middleware(
CorsMiddleware::new()
.allow_origin(CorsOrigin::from("*"))
.allow_methods(HeaderValue::from_static("GET, POST, OPTIONS")),
);
app.middleware(RequestLogger::new());
app.middleware(tide_compression::Compression::new());
app.middleware(tide_compression::Decompression::new());
routes::load_routes(&mut app);
info!("Server HTTP enabled");
app.run(opt.http_addr)?;
Ok(())
}


@ -0,0 +1,3 @@
pub mod schema;
pub mod token;
pub mod update_operation;


@ -0,0 +1,118 @@
use std::collections::HashSet;
use indexmap::IndexMap;
use meilidb_schema::{Schema, SchemaBuilder, SchemaProps};
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum FieldProperties {
Identifier,
Indexed,
Displayed,
Ranked,
}
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SchemaBody(IndexMap<String, HashSet<FieldProperties>>);
impl From<Schema> for SchemaBody {
fn from(value: Schema) -> SchemaBody {
let mut map = IndexMap::new();
for (name, _attr, props) in value.iter() {
let old_properties = map.entry(name.to_owned()).or_insert(HashSet::new());
if props.is_indexed() {
old_properties.insert(FieldProperties::Indexed);
}
if props.is_displayed() {
old_properties.insert(FieldProperties::Displayed);
}
if props.is_ranked() {
old_properties.insert(FieldProperties::Ranked);
}
}
let old_properties = map
.entry(value.identifier_name().to_string())
.or_insert(HashSet::new());
old_properties.insert(FieldProperties::Identifier);
old_properties.insert(FieldProperties::Displayed);
SchemaBody(map)
}
}
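// The reverse conversion: rebuilds a SchemaBuilder from the JSON body, defaulting the identifier
// to "documentId" when no field carries the identifier property.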
impl Into<Schema> for SchemaBody {
fn into(self) -> Schema {
let mut identifier = "documentId".to_string();
let mut attributes = IndexMap::new();
for (field, properties) in self.0 {
let mut indexed = false;
let mut displayed = false;
let mut ranked = false;
for property in properties {
match property {
FieldProperties::Indexed => indexed = true,
FieldProperties::Displayed => displayed = true,
FieldProperties::Ranked => ranked = true,
FieldProperties::Identifier => identifier = field.clone(),
}
}
attributes.insert(
field,
SchemaProps {
indexed,
displayed,
ranked,
},
);
}
let mut builder = SchemaBuilder::with_identifier(identifier);
for (field, props) in attributes {
builder.new_attribute(field, props);
}
builder.build()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_schema_body_conversion() {
let schema_body = r#"
{
"id": ["identifier", "indexed", "displayed"],
"title": ["indexed", "displayed"],
"date": ["displayed"]
}
"#;
let schema_builder = r#"
{
"identifier": "id",
"attributes": {
"id": {
"indexed": true,
"displayed": true
},
"title": {
"indexed": true,
"displayed": true
},
"date": {
"displayed": true
}
}
}
"#;
let schema_body: SchemaBody = serde_json::from_str(schema_body).unwrap();
let schema_builder: SchemaBuilder = serde_json::from_str(schema_builder).unwrap();
let schema_from_body: Schema = schema_body.into();
let schema_from_builder: Schema = schema_builder.build();
assert_eq!(schema_from_body, schema_from_builder);
}
}


@ -0,0 +1,72 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
pub const TOKEN_PREFIX_KEY: &str = "_token_";
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum ACL {
IndexesRead,
IndexesWrite,
DocumentsRead,
DocumentsWrite,
SettingsRead,
SettingsWrite,
Admin,
#[serde(rename = "*")]
All,
}
pub type Wildcard = String;
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Token {
pub key: String,
pub description: String,
pub acl: Vec<ACL>,
pub indexes: Vec<Wildcard>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
pub expires_at: DateTime<Utc>,
pub revoked: bool,
}
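// `cleanup_wildcard` strips a leading and/or trailing `*` and reports which sides had one;
// `match_wildcard` then performs an exact, prefix, suffix, or substring match accordingly.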
fn cleanup_wildcard(input: &str) -> (bool, &str, bool) {
let first = input.chars().next().filter(|&c| c == '*').is_some();
let last = input.chars().last().filter(|&c| c == '*').is_some();
let bound_last = std::cmp::max(input.len().saturating_sub(last as usize), first as usize);
let output = input.get(first as usize..bound_last).unwrap();
(first, output, last)
}
pub fn match_wildcard(pattern: &str, input: &str) -> bool {
let (first, pattern, last) = cleanup_wildcard(pattern);
match (first, last) {
(false, false) => pattern == input,
(true, false) => input.ends_with(pattern),
(false, true) => input.starts_with(pattern),
(true, true) => input.contains(pattern),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_match_wildcard() {
assert!(match_wildcard("*", "qqq"));
assert!(match_wildcard("*", ""));
assert!(match_wildcard("*ab", "qqqab"));
assert!(match_wildcard("*ab*", "qqqabqq"));
assert!(match_wildcard("ab*", "abqqq"));
assert!(match_wildcard("**", "ab"));
assert!(match_wildcard("ab", "ab"));
assert!(match_wildcard("ab*", "ab"));
assert!(match_wildcard("*ab", "ab"));
assert!(match_wildcard("*ab*", "ab"));
assert!(match_wildcard("*😆*", "ab😆dsa"));
}
}


@ -0,0 +1,33 @@
use std::fmt;
#[allow(dead_code)]
#[derive(Debug)]
pub enum UpdateOperation {
ClearAllDocuments,
DocumentsAddition,
DocumentsDeletion,
SynonymsAddition,
SynonymsDeletion,
StopWordsAddition,
StopWordsDeletion,
Schema,
Config,
}
impl fmt::Display for UpdateOperation {
fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
use UpdateOperation::*;
match self {
ClearAllDocuments => write!(f, "ClearAllDocuments"),
DocumentsAddition => write!(f, "DocumentsAddition"),
DocumentsDeletion => write!(f, "DocumentsDeletion"),
SynonymsAddition => write!(f, "SynonymsAddition"),
SynonymsDeletion => write!(f, "SynonymsDelettion"),
StopWordsAddition => write!(f, "StopWordsAddition"),
StopWordsDeletion => write!(f, "StopWordsDeletion"),
Schema => write!(f, "Schema"),
Config => write!(f, "Config"),
}
}
}


@ -0,0 +1,56 @@
use envconfig::Envconfig;
use structopt::StructOpt;
#[derive(Debug, Clone, StructOpt, Envconfig)]
struct Vars {
/// The destination where the database must be created.
#[structopt(long)]
#[envconfig(from = "MEILI_DATABASE_PATH")]
pub database_path: Option<String>,
/// The addr on which the http server will listen.
#[structopt(long)]
#[envconfig(from = "MEILI_HTTP_ADDR")]
pub http_addr: Option<String>,
#[structopt(long)]
#[envconfig(from = "MEILI_ADMIN_TOKEN")]
pub admin_token: Option<String>,
}
#[derive(Clone, Debug)]
pub struct Opt {
pub database_path: String,
pub http_addr: String,
pub admin_token: Option<String>,
}
impl Default for Opt {
fn default() -> Self {
Opt {
database_path: String::from("/tmp/meilidb"),
http_addr: String::from("127.0.0.1:8080"),
admin_token: None,
}
}
}
impl Opt {
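// Configuration precedence: environment variables override command-line arguments, which override
// the built-in defaults.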
pub fn new() -> Self {
let default = Self::default();
let args = Vars::from_args();
let env = Vars::init().unwrap();
Self {
database_path: env
.database_path
.or(args.database_path)
.unwrap_or(default.database_path),
http_addr: env
.http_addr
.or(args.http_addr)
.unwrap_or(default.http_addr),
admin_token: env.admin_token.or(args.admin_token).or(default.admin_token),
}
}
}


@ -0,0 +1,262 @@
use std::collections::{BTreeSet, HashSet};
use http::StatusCode;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tide::querystring::ContextExt as QSContextExt;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
pub async fn get_document(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let identifier = ctx.identifier()?;
let document_id = meilidb_core::serde::compute_document_id(identifier.clone());
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let response = index
.document::<IndexMap<String, Value>>(&reader, None, document_id)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::document_not_found(&identifier))?;
if response.is_empty() {
return Err(ResponseError::document_not_found(identifier));
}
Ok(tide::response::json(response))
}
#[derive(Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct IndexUpdateResponse {
pub update_id: u64,
}
pub async fn delete_document(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let identifier = ctx.identifier()?;
let document_id = meilidb_core::serde::compute_document_id(identifier.clone());
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut documents_deletion = index.documents_deletion();
documents_deletion.delete_document_by_id(document_id);
let update_id = documents_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
#[derive(Default, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct BrowseQuery {
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<String>,
}
pub async fn browse_documents(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let query: BrowseQuery = ctx.url_query().unwrap_or(BrowseQuery::default());
let offset = query.offset.unwrap_or(0);
let limit = query.limit.unwrap_or(20);
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let documents_ids: Result<BTreeSet<_>, _> =
match index.documents_fields_counts.documents_ids(&reader) {
Ok(documents_ids) => documents_ids.skip(offset).take(limit).collect(),
Err(e) => return Err(ResponseError::internal(e)),
};
let documents_ids = match documents_ids {
Ok(documents_ids) => documents_ids,
Err(e) => return Err(ResponseError::internal(e)),
};
let mut response_body = Vec::<IndexMap<String, Value>>::new();
if let Some(attributes) = query.attributes_to_retrieve {
let attributes = attributes.split(',').collect::<HashSet<&str>>();
for document_id in documents_ids {
if let Ok(Some(document)) = index.document(&reader, Some(&attributes), document_id) {
response_body.push(document);
}
}
} else {
for document_id in documents_ids {
if let Ok(Some(document)) = index.document(&reader, None, document_id) {
response_body.push(document);
}
}
}
if response_body.is_empty() {
Ok(tide::response::json(response_body)
.with_status(StatusCode::NO_CONTENT)
.into_response())
} else {
Ok(tide::response::json(response_body)
.with_status(StatusCode::OK)
.into_response())
}
}
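// Guesses a schema from a document: the first key whose lowercased name contains "id" becomes the
// identifier, and every field is declared displayed and indexed.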
fn infered_schema(document: &IndexMap<String, Value>) -> Option<meilidb_schema::Schema> {
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};
let mut identifier = None;
for key in document.keys() {
if identifier.is_none() && key.to_lowercase().contains("id") {
identifier = Some(key);
}
}
match identifier {
Some(identifier) => {
let mut builder = SchemaBuilder::with_identifier(identifier);
for key in document.keys() {
builder.new_attribute(key, DISPLAYED | INDEXED);
}
Some(builder.build())
}
None => None,
}
}
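// Shared handler for the POST (replace) and PUT (partial update) document routes; when the index
// has no schema yet, one is inferred from the first document before the addition is enqueued.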
async fn update_multiple_documents(mut ctx: Context<Data>, is_partial: bool) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<IndexMap<String, Value>> =
ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let current_schema = index
.main
.schema(&writer)
.map_err(ResponseError::internal)?;
if current_schema.is_none() {
match data.first().and_then(infered_schema) {
Some(schema) => {
index
.schema_update(&mut writer, schema)
.map_err(ResponseError::internal)?;
}
None => return Err(ResponseError::bad_request("Could not infer a schema")),
}
}
let mut document_addition = if is_partial {
index.documents_partial_addition()
} else {
index.documents_addition()
};
for document in data {
document_addition.update_document(document);
}
let update_id = document_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn add_or_replace_multiple_documents(ctx: Context<Data>) -> SResult<Response> {
update_multiple_documents(ctx, false).await
}
pub async fn add_or_update_multiple_documents(ctx: Context<Data>) -> SResult<Response> {
update_multiple_documents(ctx, true).await
}
pub async fn delete_multiple_documents(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<Value> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut documents_deletion = index.documents_deletion();
for identifier in data {
if let Some(identifier) = meilidb_core::serde::value_to_string(&identifier) {
documents_deletion
.delete_document_by_id(meilidb_core::serde::compute_document_id(identifier));
}
}
let update_id = documents_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn clear_all_documents(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let update_id = index
.clear_all(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}


@ -0,0 +1,79 @@
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
use heed::types::{Str, Unit};
use serde::Deserialize;
use tide::Context;
const UNHEALTHY_KEY: &str = "_is_unhealthy";
pub async fn get_health(ctx: Context<Data>) -> SResult<()> {
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
if let Ok(Some(_)) = common_store.get::<Str, Unit>(&reader, UNHEALTHY_KEY) {
return Err(ResponseError::Maintenance);
}
Ok(())
}
pub async fn set_healthy(ctx: Context<Data>) -> SResult<()> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
match common_store.delete::<Str>(&mut writer, UNHEALTHY_KEY) {
Ok(_) => (),
Err(e) => return Err(ResponseError::internal(e)),
}
if let Err(e) = writer.commit() {
return Err(ResponseError::internal(e));
}
Ok(())
}
pub async fn set_unhealthy(ctx: Context<Data>) -> SResult<()> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = ctx.state().db.common_store();
if let Err(e) = common_store.put::<Str, Unit>(&mut writer, UNHEALTHY_KEY, &()) {
return Err(ResponseError::internal(e));
}
if let Err(e) = writer.commit() {
return Err(ResponseError::internal(e));
}
Ok(())
}
#[derive(Deserialize, Clone)]
struct HealtBody {
health: bool,
}
pub async fn change_healthyness(mut ctx: Context<Data>) -> SResult<()> {
let body: HealtBody = ctx.body_json().await.map_err(ResponseError::bad_request)?;
if body.health {
set_healthy(ctx).await
} else {
set_unhealthy(ctx).await
}
}


@ -0,0 +1,214 @@
use http::StatusCode;
use meilidb_core::{ProcessedUpdateResult, UpdateStatus};
use meilidb_schema::Schema;
use serde_json::json;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::schema::SchemaBody;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
pub async fn list_indexes(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let list = ctx
.state()
.db
.indexes_names()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(list))
}
pub async fn get_index_schema(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let schema = index
.main
.schema(&reader)
.map_err(ResponseError::create_index)?;
match schema {
Some(schema) => {
let schema = SchemaBody::from(schema);
Ok(tide::response::json(schema))
}
None => Ok(
tide::response::json(json!({ "message": "missing index schema" }))
.with_status(StatusCode::NOT_FOUND)
.into_response(),
),
}
}
pub async fn create_index(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesWrite)?;
let index_name = ctx.url_param("index")?;
let body = ctx.body_bytes().await.map_err(ResponseError::bad_request)?;
let schema: Option<Schema> = if body.is_empty() {
None
} else {
serde_json::from_slice::<SchemaBody>(&body)
.map_err(ResponseError::bad_request)
.map(|s| Some(s.into()))?
};
let db = &ctx.state().db;
let created_index = match db.create_index(&index_name) {
Ok(index) => index,
Err(meilidb_core::Error::IndexAlreadyExists) => db.open_index(&index_name).ok_or(
ResponseError::internal("index not found but must have been found"),
)?,
Err(e) => return Err(ResponseError::create_index(e)),
};
let callback_context = ctx.state().clone();
let callback_name = index_name.clone();
db.set_update_callback(
&index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
match schema {
Some(schema) => {
let update_id = created_index
.schema_update(&mut writer, schema.clone())
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::CREATED)
.into_response())
}
None => Ok(Response::new(tide::Body::empty())
.with_status(StatusCode::NO_CONTENT)
.into_response()),
}
}
pub async fn update_schema(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesWrite)?;
let index_name = ctx.url_param("index")?;
let schema = ctx
.body_json::<SchemaBody>()
.await
.map_err(ResponseError::bad_request)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let index = db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(index_name))?;
let schema: meilidb_schema::Schema = schema.into();
let update_id = index
.schema_update(&mut writer, schema.clone())
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn get_update_status(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let update_id = ctx
.param::<u64>("update_id")
.map_err(|e| ResponseError::bad_parameter("update_id", e))?;
let index = ctx.index()?;
let status = index
.update_status(&reader, update_id)
.map_err(ResponseError::internal)?;
let response = match status {
UpdateStatus::Enqueued(data) => {
tide::response::json(json!({ "status": "enqueued", "data": data }))
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Processed(data) => {
tide::response::json(json!({ "status": "processed", "data": data }))
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Unknown => tide::response::json(json!({ "message": "unknown update id" }))
.with_status(StatusCode::NOT_FOUND)
.into_response(),
};
Ok(response)
}
pub async fn get_all_updates_status(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(IndexesRead)?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let index = ctx.index()?;
let all_status = index
.all_updates_status(&reader)
.map_err(ResponseError::internal)?;
let response = tide::response::json(all_status)
.with_status(StatusCode::OK)
.into_response();
Ok(response)
}
pub async fn delete_index(ctx: Context<Data>) -> SResult<StatusCode> {
ctx.is_allowed(IndexesWrite)?;
let index_name = ctx.url_param("index")?;
let found = ctx
.state()
.db
.delete_index(&index_name)
.map_err(ResponseError::internal)?;
if found {
Ok(StatusCode::OK)
} else {
Ok(StatusCode::NOT_FOUND)
}
}
pub fn index_update_callback(index_name: &str, data: &Data, _status: ProcessedUpdateResult) {
let env = &data.db.env;
let mut writer = env.write_txn().unwrap();
data.compute_stats(&mut writer, &index_name).unwrap();
data.set_last_update(&mut writer, &index_name).unwrap();
writer.commit().unwrap();
}


@ -0,0 +1,189 @@
use chrono::serde::ts_seconds;
use chrono::{DateTime, Utc};
use heed::types::{SerdeBincode, Str};
use http::StatusCode;
use rand::seq::SliceRandom;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::models::token::*;
use crate::Data;
fn generate_api_key() -> String {
let mut rng = rand::thread_rng();
let sample = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
sample
.choose_multiple(&mut rng, 40)
.map(|c| *c as char)
.collect()
}
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let mut response: Vec<Token> = Vec::new();
let iter = common_store
.prefix_iter::<Str, SerdeBincode<Token>>(&reader, TOKEN_PREFIX_KEY)
.map_err(ResponseError::internal)?;
for result in iter {
let (_, token) = result.map_err(ResponseError::internal)?;
response.push(token);
}
Ok(tide::response::json(response))
}
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let db = &ctx.state().db;
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
let token_config = db
.common_store()
.get::<Str, SerdeBincode<Token>>(&reader, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
Ok(tide::response::json(token_config))
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct CreatedRequest {
description: String,
acl: Vec<ACL>,
indexes: Vec<Wildcard>,
#[serde(with = "ts_seconds")]
expires_at: DateTime<Utc>,
}
pub async fn create(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let data: CreatedRequest = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let key = generate_api_key();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, key);
let token_definition = Token {
key,
description: data.description,
acl: data.acl,
indexes: data.indexes,
expires_at: data.expires_at,
created_at: Utc::now(),
updated_at: Utc::now(),
revoked: false,
};
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
db.common_store()
.put::<Str, SerdeBincode<Token>>(&mut writer, &token_key, &token_definition)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(tide::response::json(token_definition)
.with_status(StatusCode::CREATED)
.into_response())
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct UpdatedRequest {
description: Option<String>,
acl: Option<Vec<ACL>>,
indexes: Option<Vec<Wildcard>>,
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let data: UpdatedRequest = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
let mut token_config = common_store
.get::<Str, SerdeBincode<Token>>(&writer, &token_key)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found(format!(
"token key: {}",
token_key
)))?;
// apply the modifications
if let Some(description) = data.description {
token_config.description = description;
}
if let Some(acl) = data.acl {
token_config.acl = acl;
}
if let Some(indexes) = data.indexes {
token_config.indexes = indexes;
}
token_config.updated_at = Utc::now();
common_store
.put::<Str, SerdeBincode<Token>>(&mut writer, &token_key, &token_config)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(tide::response::json(token_config)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(ctx: Context<Data>) -> SResult<StatusCode> {
ctx.is_allowed(Admin)?;
let request_key = ctx.url_param("key")?;
let db = &ctx.state().db;
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let common_store = db.common_store();
let token_key = format!("{}{}", TOKEN_PREFIX_KEY, request_key);
common_store
.delete::<Str>(&mut writer, &token_key)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
Ok(StatusCode::ACCEPTED)
}


@ -0,0 +1,112 @@
use crate::data::Data;
pub mod document;
pub mod health;
pub mod index;
pub mod key;
pub mod search;
pub mod setting;
pub mod stats;
pub mod stop_words;
pub mod synonym;
pub fn load_routes(app: &mut tide::App<Data>) {
app.at("").nest(|router| {
router.at("/indexes").nest(|router| {
router.at("/").get(index::list_indexes);
router.at("/search").post(search::search_multi_index);
router.at("/:index").nest(|router| {
router.at("/search").get(search::search_with_url_query);
router.at("/updates").nest(|router| {
router.at("/").get(index::get_all_updates_status);
router.at("/:update_id").get(index::get_update_status);
});
router
.at("/")
.get(index::get_index_schema)
.post(index::create_index)
.put(index::update_schema)
.delete(index::delete_index);
router.at("/documents").nest(|router| {
router
.at("/")
.get(document::browse_documents)
.post(document::add_or_replace_multiple_documents)
.put(document::add_or_update_multiple_documents)
.delete(document::clear_all_documents);
router.at("/:identifier").nest(|router| {
router
.at("/")
.get(document::get_document)
.delete(document::delete_document);
});
router
.at("/delete")
.post(document::delete_multiple_documents);
});
router.at("/synonym").nest(|router| {
router.at("/").get(synonym::list).post(synonym::create);
router
.at("/:synonym")
.get(synonym::get)
.put(synonym::update)
.delete(synonym::delete);
router.at("/batch").post(synonym::batch_write);
router.at("/clear").post(synonym::clear);
});
router.at("/stop-words").nest(|router| {
router
.at("/")
.get(stop_words::list)
.put(stop_words::add)
.delete(stop_words::delete);
});
router
.at("/settings")
.get(setting::get)
.post(setting::update);
});
});
router.at("/keys").nest(|router| {
router.at("/").get(key::list).post(key::create);
router
.at("/:key")
.get(key::get)
.put(key::update)
.delete(key::delete);
});
});
// Private
app.at("").nest(|router| {
router
.at("/health")
.get(health::get_health)
.post(health::set_healthy)
.put(health::change_healthyness)
.delete(health::set_unhealthy);
router.at("/stats").get(stats::get_stats);
router.at("/stats/:index").get(stats::index_stat);
router.at("/version").get(stats::get_version);
router.at("/sys-info").get(stats::get_sys_info);
router
.at("/sys-info/pretty")
.get(stats::get_sys_info_pretty);
});
}


@ -0,0 +1,231 @@
use std::collections::HashMap;
use std::collections::HashSet;
use std::time::Duration;
use meilidb_core::Index;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
use tide::querystring::ContextExt as QSContextExt;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::meilidb::{Error, IndexSearchExt, SearchHit};
use crate::helpers::tide::ContextExt;
use crate::Data;
#[derive(Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct SearchQuery {
q: String,
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<String>,
attributes_to_search_in: Option<String>,
attributes_to_crop: Option<String>,
crop_length: Option<usize>,
attributes_to_highlight: Option<String>,
filters: Option<String>,
timeout_ms: Option<u64>,
matches: Option<bool>,
}
pub async fn search_with_url_query(ctx: Context<Data>) -> SResult<Response> {
// ctx.is_allowed(DocumentsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let query: SearchQuery = ctx
.url_query()
.map_err(|_| ResponseError::bad_request("invalid query parameter"))?;
let mut search_builder = index.new_search(query.q.clone());
if let Some(offset) = query.offset {
search_builder.offset(offset);
}
if let Some(limit) = query.limit {
search_builder.limit(limit);
}
if let Some(attributes_to_retrieve) = query.attributes_to_retrieve {
for attr in attributes_to_retrieve.split(',') {
search_builder.add_retrievable_field(attr.to_string());
}
}
if let Some(attributes_to_search_in) = query.attributes_to_search_in {
let attributes_to_search_in = attributes_to_search_in
.split(',')
.map(ToString::to_string)
.collect();
search_builder.attributes_to_search_in(attributes_to_search_in);
}
if let Some(attributes_to_crop) = query.attributes_to_crop {
let crop_length = query.crop_length.unwrap_or(200);
let attributes_to_crop = attributes_to_crop
.split(',')
.map(|r| (r.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
}
if let Some(attributes_to_highlight) = query.attributes_to_highlight {
let attributes_to_highlight = attributes_to_highlight
.split(',')
.map(ToString::to_string)
.collect();
search_builder.attributes_to_highlight(attributes_to_highlight);
}
if let Some(filters) = query.filters {
search_builder.filters(filters);
}
if let Some(timeout_ms) = query.timeout_ms {
search_builder.timeout(Duration::from_millis(timeout_ms));
}
if let Some(matches) = query.matches {
if matches {
search_builder.get_matches();
}
}
let response = match search_builder.search(&reader) {
Ok(response) => response,
Err(Error::Internal(message)) => return Err(ResponseError::Internal(message)),
Err(others) => return Err(ResponseError::bad_request(others)),
};
Ok(tide::response::json(response))
}
#[derive(Clone, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct SearchMultiBody {
indexes: HashSet<String>,
query: String,
offset: Option<usize>,
limit: Option<usize>,
attributes_to_retrieve: Option<HashSet<String>>,
attributes_to_search_in: Option<HashSet<String>>,
attributes_to_crop: Option<HashMap<String, usize>>,
attributes_to_highlight: Option<HashSet<String>>,
filters: Option<String>,
timeout_ms: Option<u64>,
matches: Option<bool>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct SearchMultiBodyResponse {
hits: HashMap<String, Vec<SearchHit>>,
offset: usize,
hits_per_page: usize,
processing_time_ms: usize,
query: String,
}
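// Runs the same query against several indexes in parallel (a "*" entry expands to every index),
// merges the hits per index, and reports the slowest per-index processing time.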
pub async fn search_multi_index(mut ctx: Context<Data>) -> SResult<Response> {
// ctx.is_allowed(DocumentsRead)?;
let body = ctx
.body_json::<SearchMultiBody>()
.await
.map_err(ResponseError::bad_request)?;
let mut index_list = body.clone().indexes;
for index in index_list.clone() {
if index == "*" {
index_list = ctx
.state()
.db
.indexes_names()
.map_err(ResponseError::internal)?
.into_iter()
.collect();
}
}
let mut offset = 0;
let mut count = 20;
if let Some(body_offset) = body.offset {
if let Some(limit) = body.limit {
offset = body_offset;
count = limit;
}
}
let offset = offset;
let count = count;
let db = &ctx.state().db;
let par_body = body.clone();
let responses_per_index: Vec<SResult<_>> = index_list
.into_par_iter()
.map(move |index_name| {
let index: Index = db
.open_index(&index_name)
.ok_or(ResponseError::index_not_found(&index_name))?;
let mut search_builder = index.new_search(par_body.query.clone());
search_builder.offset(offset);
search_builder.limit(count);
if let Some(attributes_to_retrieve) = par_body.attributes_to_retrieve.clone() {
search_builder.attributes_to_retrieve(attributes_to_retrieve);
}
if let Some(attributes_to_search_in) = par_body.attributes_to_search_in.clone() {
search_builder.attributes_to_search_in(attributes_to_search_in);
}
if let Some(attributes_to_crop) = par_body.attributes_to_crop.clone() {
search_builder.attributes_to_crop(attributes_to_crop);
}
if let Some(attributes_to_highlight) = par_body.attributes_to_highlight.clone() {
search_builder.attributes_to_highlight(attributes_to_highlight);
}
if let Some(filters) = par_body.filters.clone() {
search_builder.filters(filters);
}
if let Some(timeout_ms) = par_body.timeout_ms {
search_builder.timeout(Duration::from_millis(timeout_ms));
}
if let Some(matches) = par_body.matches {
if matches {
search_builder.get_matches();
}
}
let env = &db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let response = search_builder
.search(&reader)
.map_err(ResponseError::internal)?;
Ok((index_name, response))
})
.collect();
let mut hits_map = HashMap::new();
let mut max_query_time = 0;
for response in responses_per_index {
if let Ok((index_name, response)) = response {
if response.processing_time_ms > max_query_time {
max_query_time = response.processing_time_ms;
}
hits_map.insert(index_name, response.hits);
}
}
let response = SearchMultiBodyResponse {
hits: hits_map,
offset,
hits_per_page: count,
processing_time_ms: max_query_time,
query: body.query,
};
Ok(tide::response::json(response))
}

View File

@ -0,0 +1,93 @@
use std::collections::{HashMap, HashSet};
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SettingBody {
pub stop_words: Option<StopWords>,
pub ranking_order: Option<RankingOrder>,
pub distinct_field: Option<DistinctField>,
pub ranking_rules: Option<RankingRules>,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RankingOrdering {
Asc,
Dsc,
}
pub type StopWords = HashSet<String>;
pub type RankingOrder = Vec<String>;
pub type DistinctField = String;
pub type RankingRules = HashMap<String, RankingOrdering>;
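// Illustrative request body for the update route below (field names follow the
// camelCase renaming above; the criterion and attribute names are made up for the example):
// {
//   "stopWords": ["the", "a", "an"],
//   "rankingOrder": ["_sum_of_typos", "release_date"],
//   "distinctField": "movie_id",
//   "rankingRules": { "release_date": "dsc" }
// }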
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let settings = match index.main.customs(&reader).unwrap() {
Some(bytes) => bincode::deserialize(bytes).unwrap(),
None => SettingBody::default(),
};
Ok(tide::response::json(settings))
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let settings: SettingBody = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut current_settings = match index.main.customs(&writer).unwrap() {
Some(bytes) => bincode::deserialize(bytes).unwrap(),
None => SettingBody::default(),
};
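// Partial update: only the fields present in the request body overwrite the stored settings.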
if let Some(stop_words) = settings.stop_words {
current_settings.stop_words = Some(stop_words);
}
if let Some(ranking_order) = settings.ranking_order {
current_settings.ranking_order = Some(ranking_order);
}
if let Some(distinct_field) = settings.distinct_field {
current_settings.distinct_field = Some(distinct_field);
}
if let Some(ranking_rules) = settings.ranking_rules {
current_settings.ranking_rules = Some(ranking_rules);
}
let bytes = bincode::serialize(&current_settings).unwrap();
let update_id = index
.customs_update(&mut writer, bytes)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}

View File

@ -0,0 +1,334 @@
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use pretty_bytes::converter::convert;
use serde::Serialize;
use sysinfo::{NetworkExt, Pid, ProcessExt, ProcessorExt, System, SystemExt};
use tide::{Context, Response};
use walkdir::WalkDir;
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::Data;
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct IndexStatsResponse {
number_of_documents: u64,
is_indexing: bool,
last_update: Option<DateTime<Utc>>,
fields_frequency: HashMap<String, usize>,
}
pub async fn index_stat(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let index_name = ctx.url_param("index")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let number_of_documents = index
.main
.number_of_documents(&reader)
.map_err(ResponseError::internal)?;
let fields_frequency = ctx
.state()
.fields_frequency(&reader, &index_name)
.map_err(ResponseError::internal)?
.unwrap_or_default();
let is_indexing = ctx
.state()
.is_indexing(&reader, &index_name)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found("Index not found"))?;
let last_update = ctx
.state()
.last_update(&reader, &index_name)
.map_err(ResponseError::internal)?;
let response = IndexStatsResponse {
number_of_documents,
is_indexing,
last_update,
fields_frequency,
};
Ok(tide::response::json(response))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct StatsResult {
database_size: u64,
indexes: HashMap<String, IndexStatsResponse>,
}
pub async fn get_stats(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let mut index_list = HashMap::new();
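// Gather the same per-index statistics as index_stat, one entry per known index.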
if let Ok(indexes_set) = ctx.state().db.indexes_names() {
for index_name in indexes_set {
let db = &ctx.state().db;
let env = &db.env;
let index = db.open_index(&index_name).unwrap();
let reader = env.read_txn().map_err(ResponseError::internal)?;
let number_of_documents = index
.main
.number_of_documents(&reader)
.map_err(ResponseError::internal)?;
let fields_frequency = ctx
.state()
.fields_frequency(&reader, &index_name)
.map_err(ResponseError::internal)?
.unwrap_or_default();
let is_indexing = ctx
.state()
.is_indexing(&reader, &index_name)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found("Index not found"))?;
let last_update = ctx
.state()
.last_update(&reader, &index_name)
.map_err(ResponseError::internal)?;
let response = IndexStatsResponse {
number_of_documents,
is_indexing,
last_update,
fields_frequency,
};
index_list.insert(index_name, response);
}
}
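// The reported database size is the sum of all file sizes under the database directory.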
let database_size = WalkDir::new(ctx.state().db_path.clone())
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len());
let response = StatsResult {
database_size,
indexes: index_list,
};
Ok(tide::response::json(response))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct VersionResponse {
commit_sha: String,
build_date: String,
pkg_version: String,
}
pub async fn get_version(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
let response = VersionResponse {
commit_sha: env!("VERGEN_SHA").to_string(),
build_date: env!("VERGEN_BUILD_TIMESTAMP").to_string(),
pkg_version: env!("CARGO_PKG_VERSION").to_string(),
};
Ok(tide::response::json(response))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysGlobal {
total_memory: u64,
used_memory: u64,
total_swap: u64,
used_swap: u64,
input_data: u64,
output_data: u64,
}
impl SysGlobal {
fn new() -> SysGlobal {
SysGlobal {
total_memory: 0,
used_memory: 0,
total_swap: 0,
used_swap: 0,
input_data: 0,
output_data: 0,
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysProcess {
memory: u64,
cpu: f32,
}
impl SysProcess {
fn new() -> SysProcess {
SysProcess {
memory: 0,
cpu: 0.0,
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysInfo {
memory_usage: f64,
processor_usage: Vec<f32>,
global: SysGlobal,
process: SysProcess,
}
impl SysInfo {
fn new() -> SysInfo {
SysInfo {
memory_usage: 0.0,
processor_usage: Vec::new(),
global: SysGlobal::new(),
process: SysProcess::new(),
}
}
}
pub(crate) fn report(pid: Pid) -> SysInfo {
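// Snapshot system-wide and per-process metrics with sysinfo; memory figures come back in kibibytes.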
let mut sys = System::new();
let mut info = SysInfo::new();
info.memory_usage = sys.get_used_memory() as f64 / sys.get_total_memory() as f64 * 100.0;
for processor in sys.get_processor_list() {
info.processor_usage.push(processor.get_cpu_usage() * 100.0);
}
info.global.total_memory = sys.get_total_memory();
info.global.used_memory = sys.get_used_memory();
info.global.total_swap = sys.get_total_swap();
info.global.used_swap = sys.get_used_swap();
info.global.input_data = sys.get_network().get_income();
info.global.output_data = sys.get_network().get_outcome();
if let Some(process) = sys.get_process(pid) {
info.process.memory = process.memory();
info.process.cpu = process.cpu_usage() * 100.0;
}
sys.refresh_all();
info
}
pub async fn get_sys_info(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
Ok(tide::response::json(report(ctx.state().server_pid)))
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysGlobalPretty {
total_memory: String,
used_memory: String,
total_swap: String,
used_swap: String,
input_data: String,
output_data: String,
}
impl SysGlobalPretty {
fn new() -> SysGlobalPretty {
SysGlobalPretty {
total_memory: "None".to_owned(),
used_memory: "None".to_owned(),
total_swap: "None".to_owned(),
used_swap: "None".to_owned(),
input_data: "None".to_owned(),
output_data: "None".to_owned(),
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysProcessPretty {
memory: String,
cpu: String,
}
impl SysProcessPretty {
fn new() -> SysProcessPretty {
SysProcessPretty {
memory: "None".to_owned(),
cpu: "None".to_owned(),
}
}
}
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
pub(crate) struct SysInfoPretty {
memory_usage: String,
processor_usage: Vec<String>,
global: SysGlobalPretty,
process: SysProcessPretty,
}
impl SysInfoPretty {
fn new() -> SysInfoPretty {
SysInfoPretty {
memory_usage: "None".to_owned(),
processor_usage: Vec::new(),
global: SysGlobalPretty::new(),
process: SysProcessPretty::new(),
}
}
}
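// Same metrics as report(), converted to human-readable strings (memory values arrive in kibibytes, hence the * 1024 before pretty-printing).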
pub(crate) fn report_pretty(pid: Pid) -> SysInfoPretty {
let mut sys = System::new();
let mut info = SysInfoPretty::new();
info.memory_usage = format!(
"{:.1} %",
sys.get_used_memory() as f64 / sys.get_total_memory() as f64 * 100.0
);
for processor in sys.get_processor_list() {
info.processor_usage
.push(format!("{:.1} %", processor.get_cpu_usage() * 100.0));
}
info.global.total_memory = convert(sys.get_total_memory() as f64 * 1024.0);
info.global.used_memory = convert(sys.get_used_memory() as f64 * 1024.0);
info.global.total_swap = convert(sys.get_total_swap() as f64 * 1024.0);
info.global.used_swap = convert(sys.get_used_swap() as f64 * 1024.0);
info.global.input_data = convert(sys.get_network().get_income() as f64);
info.global.output_data = convert(sys.get_network().get_outcome() as f64);
if let Some(process) = sys.get_process(pid) {
info.process.memory = convert(process.memory() as f64 * 1024.0);
info.process.cpu = format!("{:.1} %", process.cpu_usage() * 100.0);
}
sys.refresh_all();
info
}
pub async fn get_sys_info_pretty(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(Admin)?;
Ok(tide::response::json(report_pretty(ctx.state().server_pid)))
}

View File

@ -0,0 +1,82 @@
use http::StatusCode;
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let stop_words_fst = index
.main
.stop_words_fst(&reader)
.map_err(ResponseError::internal)?;
let stop_words = stop_words_fst
.unwrap_or_default()
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(stop_words))
}
pub async fn add(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut stop_words_addition = index.stop_words_addition();
for stop_word in data {
stop_words_addition.add_stop_word(stop_word);
}
let update_id = stop_words_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut stop_words_deletion = index.stop_words_deletion();
for stop_word in data {
stop_words_deletion.delete_stop_word(stop_word);
}
let update_id = stop_words_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}

View File

@ -0,0 +1,235 @@
use std::collections::HashMap;
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tide::response::IntoResponse;
use tide::{Context, Response};
use crate::error::{ResponseError, SResult};
use crate::helpers::tide::ContextExt;
use crate::models::token::ACL::*;
use crate::routes::document::IndexUpdateResponse;
use crate::Data;
#[derive(Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonym {
OneWay(SynonymOneWay),
MultiWay { synonyms: Vec<String> },
}
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SynonymOneWay {
pub input: String,
pub synonyms: Vec<String>,
}
pub type Synonyms = Vec<Synonym>;
pub async fn list(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let synonyms_fst = index
.main
.synonyms_fst(&reader)
.map_err(ResponseError::internal)?;
let synonyms_fst = synonyms_fst.unwrap_or_default();
let synonyms_list = synonyms_fst.stream().into_strs().unwrap();
let mut response = HashMap::new();
let index_synonyms = &index.synonyms;
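// For every entry in the synonyms FST, fetch its list of alternative words.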
for synonym in synonyms_list {
let alternative_list = index_synonyms
.synonyms(&reader, synonym.as_bytes())
.unwrap()
.unwrap()
.stream()
.into_strs()
.unwrap();
response.insert(synonym, alternative_list);
}
Ok(tide::response::json(response))
}
pub async fn get(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsRead)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let synonym_list = index
.synonyms
.synonyms(&reader, synonym.as_bytes())
.map_err(ResponseError::internal)?
.ok_or(ResponseError::not_found("Synonym not found"))?
.stream()
.into_strs()
.map_err(ResponseError::internal)?;
Ok(tide::response::json(synonym_list))
}
pub async fn create(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let data: Synonym = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
match data.clone() {
Synonym::OneWay(content) => {
synonyms_addition.add_synonym(content.input, content.synonyms.into_iter())
}
Synonym::MultiWay { mut synonyms } => {
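// Multi-way group: rotate the list so each word becomes an input mapped to all the others.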
if synonyms.len() > 1 {
for _ in 0..synonyms.len() {
let (first, elems) = synonyms.split_first().unwrap();
synonyms_addition.add_synonym(first, elems.iter());
synonyms.rotate_left(1);
}
}
}
}
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::CREATED)
.into_response())
}
pub async fn update(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let data: Vec<String> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
synonyms_addition.add_synonym(synonym.clone(), data.clone().into_iter());
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn delete(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let synonym = ctx.url_param("synonym")?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_deletion = index.synonyms_deletion();
synonyms_deletion.delete_all_alternatives_of(synonym);
let update_id = synonyms_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn batch_write(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let data: Synonyms = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let mut synonyms_addition = index.synonyms_addition();
for raw in data {
match raw {
Synonym::OneWay(content) => {
synonyms_addition.add_synonym(content.input, content.synonyms.into_iter())
}
Synonym::MultiWay { mut synonyms } => {
if synonyms.len() > 1 {
for _ in 0..synonyms.len() {
let (first, elems) = synonyms.split_first().unwrap();
synonyms_addition.add_synonym(first, elems.iter());
synonyms.rotate_left(1);
}
}
}
}
}
let update_id = synonyms_addition
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}
pub async fn clear(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(SettingsWrite)?;
let index = ctx.index()?;
let env = &ctx.state().db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
let synonyms_fst = index
.main
.synonyms_fst(&writer)
.map_err(ResponseError::internal)?;
let synonyms_fst = synonyms_fst.unwrap_or_default();
let synonyms_list = synonyms_fst.stream().into_strs().unwrap();
let mut synonyms_deletion = index.synonyms_deletion();
for synonym in synonyms_list {
synonyms_deletion.delete_all_alternatives_of(synonym);
}
let update_id = synonyms_deletion
.finalize(&mut writer)
.map_err(ResponseError::internal)?;
writer.commit().map_err(ResponseError::internal)?;
let response_body = IndexUpdateResponse { update_id };
Ok(tide::response::json(response_body)
.with_status(StatusCode::ACCEPTED)
.into_response())
}

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-schema"
version = "0.6.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"

View File

@ -22,7 +22,7 @@ pub const RANKED: SchemaProps = SchemaProps {
ranked: true,
};
#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
pub displayed: bool,
@ -60,6 +60,36 @@ impl BitOr for SchemaProps {
}
}
impl fmt::Debug for SchemaProps {
#[allow(non_camel_case_types)]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
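// Unit marker types whose names render as the flag labels in the debug set below.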
#[derive(Debug)]
struct DISPLAYED;
#[derive(Debug)]
struct INDEXED;
#[derive(Debug)]
struct RANKED;
let mut debug_set = f.debug_set();
if self.displayed {
debug_set.entry(&DISPLAYED);
}
if self.indexed {
debug_set.entry(&INDEXED);
}
if self.ranked {
debug_set.entry(&RANKED);
}
debug_set.finish()
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
identifier: String,
@ -102,12 +132,12 @@ impl SchemaBuilder {
}
}
#[derive(Clone, PartialEq, Eq)]
pub struct Schema {
inner: Arc<InnerSchema>,
}
#[derive(Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
@ -139,6 +169,10 @@ impl Schema {
attributes
}
pub fn number_of_attributes(&self) -> usize {
self.inner.attrs.len()
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
@ -184,6 +218,16 @@ impl<'de> Deserialize<'de> for Schema {
}
}
impl fmt::Debug for Schema {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let builder = self.to_builder();
f.debug_struct("Schema")
.field("identifier", &builder.identifier)
.field("attributes", &builder.attributes)
.finish()
}
}
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
@ -215,11 +259,155 @@ impl fmt::Display for SchemaAttr {
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Diff {
IdentChange {
old: String,
new: String,
},
AttrMove {
name: String,
old: usize,
new: usize,
},
AttrPropsChange {
name: String,
old: SchemaProps,
new: SchemaProps,
},
NewAttr {
name: String,
pos: usize,
props: SchemaProps,
},
RemovedAttr {
name: String,
},
}
pub fn diff(old: &Schema, new: &Schema) -> Vec<Diff> {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut differences = Vec::new();
let old = old.to_builder();
let new = new.to_builder();
// check if the old identifier differs from the new one
if old.identifier != new.identifier {
let old = old.identifier;
let new = new.identifier;
differences.push(IdentChange { old, new });
}
// compare all old attributes positions
// and properties with the new ones
for (pos, (name, props)) in old.attributes.iter().enumerate() {
match new.attributes.get_full(name) {
Some((npos, _, nprops)) => {
if pos != npos {
let name = name.clone();
differences.push(AttrMove {
name,
old: pos,
new: npos,
});
}
if props != nprops {
let name = name.clone();
differences.push(AttrPropsChange {
name,
old: *props,
new: *nprops,
});
}
}
None => differences.push(RemovedAttr { name: name.clone() }),
}
}
// retrieve all attributes that
// were not present in the old schema
for (pos, (name, props)) in new.attributes.iter().enumerate() {
if !old.attributes.contains_key(name) {
let name = name.clone();
differences.push(NewAttr {
name,
pos,
props: *props,
});
}
}
differences
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn difference() {
use Diff::{AttrMove, AttrPropsChange, IdentChange, NewAttr, RemovedAttr};
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
builder.new_attribute("omega", INDEXED);
let old = builder.build();
let mut builder = SchemaBuilder::with_identifier("kiki");
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("alpha", DISPLAYED | INDEXED);
builder.new_attribute("delta", RANKED);
builder.new_attribute("gamma", DISPLAYED);
let new = builder.build();
let differences = diff(&old, &new);
let expected = &[
IdentChange {
old: format!("id"),
new: format!("kiki"),
},
AttrMove {
name: format!("alpha"),
old: 0,
new: 1,
},
AttrPropsChange {
name: format!("alpha"),
old: DISPLAYED,
new: DISPLAYED | INDEXED,
},
AttrMove {
name: format!("beta"),
old: 1,
new: 0,
},
AttrMove {
name: format!("gamma"),
old: 2,
new: 3,
},
AttrPropsChange {
name: format!("gamma"),
old: INDEXED,
new: DISPLAYED,
},
RemovedAttr {
name: format!("omega"),
},
NewAttr {
name: format!("delta"),
pos: 2,
props: RANKED,
},
];
assert_eq!(&differences, expected)
}
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
@ -303,4 +491,43 @@ mod tests {
Ok(())
}
#[test]
fn debug_output() {
use std::fmt::Write as _;
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut output = String::new();
let _ = write!(&mut output, "{:#?}", schema);
let expected = r#"Schema {
identifier: "id",
attributes: {
"alpha": {
DISPLAYED,
},
"beta": {
DISPLAYED,
INDEXED,
},
"gamma": {
INDEXED,
},
},
}"#;
assert_eq!(output, expected);
let mut output = String::new();
let _ = write!(&mut output, "{:?}", schema);
let expected = r#"Schema { identifier: "id", attributes: {"alpha": {DISPLAYED}, "beta": {DISPLAYED, INDEXED}, "gamma": {INDEXED}} }"#;
assert_eq!(output, expected);
}
}

View File

@ -1,8 +1,9 @@
[package]
name = "meilidb-tokenizer"
version = "0.6.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
deunicode = "1.0.0"
slice-group-by = "0.2.4" slice-group-by = "0.2.4"

View File

@ -1,4 +1,5 @@
use self::SeparatorCategory::*;
use deunicode::deunicode_char;
use slice_group_by::StrGroupBy;
use std::iter::Peekable;
@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
c if c.is_whitespace() => Some(Soft), // whitespaces
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
'-' | '_' | '\'' | ':' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None,
}