Compare commits

...

53 Commits

Author SHA1 Message Date
685016bfec Bump meilidb-core to v0.7.0 and meilidb-http to v0.2.0 2019-11-18 15:49:23 +01:00
d30e5f6231 Merge pull request #299 from meilisearch/default-update-callbacks
Prefer using a global update callback common to all indexes
2019-11-18 15:05:21 +01:00
e854d67a55 Remove useless routes and checks 2019-11-18 14:41:49 +01:00
23a89732a5 Prefer using a global update callback common to all indexes 2019-11-18 14:41:49 +01:00
3a1f41ebdb Merge pull request #305 from meilisearch/fix-example
Make easier to interact with compacted databases
2019-11-17 20:31:06 +01:00
f873761a27 Make easier to interact with compacted databases 2019-11-17 20:01:02 +01:00
ebf620c7f9 Merge pull request #302 from meilisearch/fix-dataset-schema
Rename the movies dataset schema file
2019-11-17 17:17:33 +01:00
8b92bc3421 Rename the movies dataset schema file 2019-11-17 16:45:13 +01:00
70a5aa61e9 Merge pull request #301 from meilisearch/separate-types
Move the main types to a separate library
2019-11-17 12:45:25 +01:00
a76169042f Make the serde and zerocopy meilidb-types dependencies optional 2019-11-17 12:30:39 +01:00
c9c3cfcee9 Move the main types to a separate library 2019-11-17 12:19:36 +01:00
2e60ac5359 Merge pull request #300 from meilisearch/update-dependencies
Do not use a forked fst dependency
2019-11-17 12:19:08 +01:00
2dd7751e09 Disable the fst MemMap feature 2019-11-17 11:43:00 +01:00
26bdabcdec Do not use a forked fst dependency 2019-11-17 11:14:01 +01:00
fc8c7ed77e Merge pull request #297 from meilisearch/improve-highlights
Improve the highlight formatted outputs
2019-11-15 14:28:27 +01:00
521c96354f Improve the highlight formatted outputs 2019-11-15 14:16:21 +01:00
9788779894 Merge pull request #296 from meilisearch/update-readme
Update the README
2019-11-14 21:32:32 +01:00
9b965764ab Update the README 2019-11-14 19:09:04 +01:00
9a5a543311 Merge pull request #290 from curquiza/deploy-doc
Add information in documentation in Deploy Server part
2019-11-13 16:06:27 +01:00
b18fb868e8 Add information in documentation in Deploy Server part 2019-11-13 15:37:21 +01:00
c734af55c0 Merge pull request #289 from curquiza/status204-delete-index
Change the HTTP status code on index deletion
2019-11-13 15:33:27 +01:00
810b328ad2 Change the HTTP status code on index deletion 2019-11-13 15:14:23 +01:00
0a8039d8d8 Merge pull request #285 from bidoubiwa/remove_catching_same_index_creation
Change the error catching on the index creation route
2019-11-13 15:13:51 +01:00
e51704c09a Remove the error catching on the index creation route when the index already exist 2019-11-13 14:42:59 +01:00
623a9012d5 Merge pull request #279 from bidoubiwa/new_slogan_and_resume
Slogan and Resume proposition
2019-11-13 14:41:21 +01:00
b9a185634f Slogan and Resume proposition 2019-11-13 14:31:22 +01:00
b46889b5f0 Merge pull request #282 from meilisearch/fix-ci-artifacts
Add the meilidb-http binary to the artifacts
2019-11-13 11:39:00 +01:00
ef9a0c07db Add the meilidb-http binary to the artifacts 2019-11-13 11:15:39 +01:00
3a6f3947c9 Merge pull request #281 from meilisearch/fix-attributes-to-search-in
Take attributes to search in into account
2019-11-12 18:45:40 +01:00
5c5f41d755 Take attributes to search in into account 2019-11-12 18:35:58 +01:00
6803a8fad0 Merge pull request #280 from meilisearch/format-updates-json
Format updates json
2019-11-12 18:35:25 +01:00
8e4b362e4d Fixed the display of enqueued updates 2019-11-12 18:21:59 +01:00
acb5e624c6 Add enqueued and processed datetimes 2019-11-12 18:21:59 +01:00
a98949ff1d Improve updates JSON format 2019-11-12 16:57:22 +01:00
f355280250 Merge pull request #278 from meilisearch/mit-license
Change the license to an MIT one
2019-11-12 14:35:32 +01:00
cee8d6a8d9 Change the license to an MIT one 2019-11-12 14:24:28 +01:00
27326ea069 Merge pull request #277 from bidoubiwa/add_cmd_to_compile
Add cmd line to compile binary
2019-11-12 13:55:54 +01:00
7bbe5aca5b Add cmd line to compile binary 2019-11-12 10:57:03 +01:00
1c4afe6d0f Merge pull request #276 from meilisearch/support-slash-tokenizer
Add support for back/slashes
2019-11-11 21:46:14 +01:00
2d8f9a9849 Add support for back/slashes 2019-11-11 21:23:08 +01:00
3f41681b18 Merge pull request #274 from meilisearch/enable-env-logger
Add env logger to enable logging
2019-11-11 19:13:33 +01:00
64791815fa Add env logger to enable logging 2019-11-11 19:03:38 +01:00
8a36571a74 Merge pull request #272 from meilisearch/fix-long-words
Ignore words that are too long
2019-11-10 20:07:22 +01:00
d18e775bec Ignore words that are too long 2019-11-10 17:44:27 +01:00
78381f1818 Merge pull request #271 from meilisearch/update-dependencies
Update Dependencies
2019-11-10 11:17:09 +01:00
7f33a01ae1 Update dependencies 2019-11-10 11:04:56 +01:00
d07d14d33a Update crossbeam-channel to 0.4.0 2019-11-10 11:03:22 +01:00
540d7886ab Merge pull request #266 from meilisearch/update-readme
Update the readme and add a Quick Start section
2019-11-09 13:21:22 +01:00
5a5d10af52 Add an image description of the gif 2019-11-09 13:12:01 +01:00
f95d077ef8 Improve the README a little bit by adding a quick start section 2019-11-09 13:12:01 +01:00
05dd99936f Add a gif to show a demo using crates.io 2019-11-09 12:59:39 +01:00
c086625773 Merge pull request #269 from meilisearch/repo-became-binary
Make the repository be a binary and version the Cargo.lock
2019-11-09 12:58:52 +01:00
dc17bebf4a Make the repository be a binary and version the Cargo.lock 2019-11-09 12:13:28 +01:00
41 changed files with 22801 additions and 406 deletions

1
.gitignore vendored
View File

@ -1,5 +1,4 @@
/target
Cargo.lock
**/*.csv
**/*.json_lines
**/*.rs.bk

2519
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@ members = [
"meilidb-http",
"meilidb-schema",
"meilidb-tokenizer",
"meilidb-types",
]
[profile.release]

26
LICENSE
View File

@ -1,13 +1,21 @@
“Commons Clause” License Condition v1.0
MIT License
The Software is provided to you by the Licensor under the License, as defined below, subject to the following condition.
Copyright (c) [year] [fullname]
Without limiting other conditions in the License, the grant of rights under the License will not include, and the License does not grant to you, the right to Sell the Software.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
For purposes of the foregoing, “Sell” means practicing any or all of the rights granted to you under the License to provide to third parties, for a fee or other consideration (including without limitation fees for hosting or consulting/ support services related to the Software), a product or service whose value derives, entirely or substantially, from the functionality of the Software. Any license notice or attribution required by the License must also include this Commons Clause License Condition notice.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
Software: MeiliDB
License: MIT
Licensor: MEILI SAS
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

101
README.md
View File

@ -4,7 +4,9 @@
[![dependency status](https://deps.rs/repo/github/meilisearch/MeiliDB/status.svg)](https://deps.rs/repo/github/meilisearch/MeiliDB)
[![License](https://img.shields.io/badge/license-commons%20clause-lightgrey)](https://commonsclause.com/)
A _full-text search database_ based on the fast [LMDB key-value store](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
Ultra relevant and instant full-text search API.
MeiliSearch is a powerful, fast, open-source, easy to use and deploy search engine. Both searching and indexing are fully customizable and support features like typo-tolerance, filters, and ranking.
## Features
@ -27,16 +29,91 @@ You can [read the deep dive](deep-dive.md) if you want more information on the e
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `datasets/` folder.
[![crates.io demo gif](misc/crates-io-demo.gif)](https://crates.meilisearch.com)
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using HTTP. This is our current goal, [see the milestones](https://github.com/meilisearch/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
> Meili helps the Rust community find crates on [crates.meilisearch.com](https://crates.meilisearch.com)
## Quick Start
You can deploy your own instant, relevant and typo-tolerant MeiliDB search engine by yourself too.
Something similar to the demo above can be achieved by following these three little steps first.
You will need to create your own web front display to make it pretty though.
### Deploy the Server
If you have not installed Rust and its package manager `cargo` yet, go to [the installation page](https://www.rust-lang.org/tools/install).<br/>
You can deploy the server on your own machine; it will listen for HTTP requests on port 8080 by default.
```bash
rustup override set nightly
cargo run --release
```
For more logs during the execution, run:
```bash
RUST_LOG=info cargo run --release
```
### Create an Index and Upload Some Documents
MeiliDB can serve multiple indexes, with different kinds of documents,
therefore, it is required to create the index before sending documents to it.
```bash
curl -i -X POST 'http://127.0.0.1:8080/indexes/movies'
```
Now that the server knows about our brand new index, we can send it data.
We provided you a little dataset, it is available in the `datasets/` directory.
```bash
curl -i -X POST 'http://127.0.0.1:8080/indexes/movies/documents' \
--header 'content-type: application/json' \
--data @datasets/movies/movies.json
```
### Search for Documents
The search engine is now aware of our documents and can serve those via our HTTP server again.
The [`jq` command line tool](https://stedolan.github.io/jq/) can greatly help you read the server responses.
```bash
curl 'http://127.0.0.1:8080/indexes/movies/search?q=botman'
```
```json
{
"hits": [
{
"id": "29751",
"title": "Batman Unmasked: The Psychology of the Dark Knight",
"poster": "https://image.tmdb.org/t/p/w1280/jjHu128XLARc2k4cJrblAvZe0HE.jpg",
"overview": "Delve into the world of Batman and the vigilante justice tha",
"release_date": "2008-07-15"
},
{
"id": "471474",
"title": "Batman: Gotham by Gaslight",
"poster": "https://image.tmdb.org/t/p/w1280/7souLi5zqQCnpZVghaXv0Wowi0y.jpg",
"overview": "ve Victorian Age Gotham City, Batman begins his war on crime",
"release_date": "2018-01-12"
}
],
"offset": 0,
"limit": 2,
"processingTimeMs": 1,
"query": "botman"
}
```
## Performances
With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
With a dataset composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.
@ -51,20 +128,24 @@ Requests/sec: 2806.46
Transfer/sec: 759.17KB
```
We also indexed a dataset containing something like _12 million_ city names in _24 minutes_ on a machine with _8 cores_, _64 GB of RAM_ and a _300 GB NVMe_ SSD.<br/>
The resulting database was _16 GB_ and search results were between _30 ms_ and _4 seconds_ for short prefix queries.
### Notes
With Rust 1.32 the allocator has been [changed to use the system allocator](https://blog.rust-lang.org/2019/01/17/Rust-1.32.0.html#jemalloc-is-removed-by-default).
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
## Usage and examples
## Usage and Examples
Currently MeiliDB do not provide an http server but you can run the example binary.
MeiliDB also provides an example binary that is mostly used for features testing.
Note that the example binary indexes data faster, as it reads CSV files directly rather than JSON HTTP payloads.
The _index_ subcommand has been made to create an index and inject documents into it. Using the command line below, the index will be named _movies_ and the _19 700_ movies of the `datasets/` will be injected in MeiliDB.
```bash
cargo run --release --example from_file -- \
index example.mdb datasets/movies/data.csv \
index example.mdb datasets/movies/movies.csv \
--schema datasets/movies/schema.toml
```
@ -72,8 +153,8 @@ Once the first command is done, you can query the freshly created _movies_ index
```bash
cargo run --release --example from_file -- \
search example.mdb
--number 4 \
search example.mdb \
--number-results 4 \
--filter '!adult' \
id popularity adult original_title
```

View File

@ -42,11 +42,11 @@ jobs:
displayName: 'Build MeiliDB'
- task: CopyFiles@2
inputs:
contents: '$(System.DefaultWorkingDirectory)/target/release/libmeilidb.rlib'
contents: '$(System.DefaultWorkingDirectory)/target/release/meilidb-http'
targetFolder: $(Build.ArtifactStagingDirectory)
displayName: 'Copy build'
- task: PublishBuildArtifacts@1
inputs:
artifactName: libmeilidb.rlib
artifactName: meilidb
displayName: 'Upload artifacts'

19654
datasets/movies/movies.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-core"
version = "0.6.5"
version = "0.7.0"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
@ -8,14 +8,18 @@ edition = "2018"
arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
crossbeam-channel = "0.3.9"
chrono = { version = "0.4.9", features = ["serde"] }
crossbeam-channel = "0.4.0"
deunicode = "1.0.0"
env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.5.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.6.0" }
meilidb-types = { path = "../meilidb-types", version = "0.1.0" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3"
@ -25,15 +29,6 @@ siphasher = "0.3.0"
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"
features = ["fst_automaton"]
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dev-dependencies]
assert_matches = "1.3"
csv = "1.0.7"

View File

@ -104,14 +104,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
let start = Instant::now();
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn =
move |_name: &str, update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(&command.index_name) {
Some(index) => index,
None => database.create_index(&command.index_name).unwrap(),
};
let done = database.set_update_callback(&command.index_name, Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let env = &database.env;
@ -195,8 +195,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
);
if let Some(path) = command.compact_to_path {
fs::create_dir_all(&path)?;
let start = Instant::now();
let _file = database.copy_and_compact_to_path(&path)?;
let _file = database.copy_and_compact_to_path(path.join("data.mdb"))?;
println!(
"database compacted in {:.2?} at: {:?}",
start.elapsed(),

View File

@ -11,14 +11,15 @@ use log::debug;
use crate::{store, update, Index, MResult};
pub type BoxUpdateFn = Box<dyn Fn(update::ProcessedUpdateResult) + Send + Sync + 'static>;
pub type BoxUpdateFn = Box<dyn Fn(&str, update::ProcessedUpdateResult) + Send + Sync + 'static>;
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
pub struct Database {
pub env: heed::Env,
common_store: heed::PolyDatabase,
indexes_store: heed::Database<Str, Unit>,
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
indexes: RwLock<HashMap<String, (Index, thread::JoinHandle<()>)>>,
update_fn: Arc<ArcSwapFn>,
}
macro_rules! r#break_try {
@ -41,7 +42,13 @@ pub enum UpdateEvent {
pub type UpdateEvents = Receiver<UpdateEvent>;
pub type UpdateEventsEmitter = Sender<UpdateEvent>;
fn update_awaiter(receiver: UpdateEvents, env: heed::Env, update_fn: Arc<ArcSwapFn>, index: Index) {
fn update_awaiter(
receiver: UpdateEvents,
env: heed::Env,
index_name: &str,
update_fn: Arc<ArcSwapFn>,
index: Index,
) {
let mut receiver = receiver.into_iter();
while let Some(UpdateEvent::NewUpdate) = receiver.next() {
loop {
@ -68,7 +75,7 @@ fn update_awaiter(receiver: UpdateEvents, env: heed::Env, update_fn: Arc<ArcSwap
let status = break_try!(result, "update task failed");
// commit the nested transaction if the update was successful, abort it otherwise
if status.result.is_ok() {
if status.error.is_none() {
break_try!(nested_writer.commit(), "commit nested transaction failed");
} else {
nested_writer.abort()
@ -84,7 +91,7 @@ fn update_awaiter(receiver: UpdateEvents, env: heed::Env, update_fn: Arc<ArcSwap
// call the user callback when the update and the result are written consistently
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
(callback)(index_name, status);
}
}
}
@ -103,6 +110,7 @@ impl Database {
let common_store = env.create_poly_database(Some("common"))?;
let indexes_store = env.create_database::<Str, Unit>(Some("indexes"))?;
let update_fn = Arc::new(ArcSwapFn::empty());
// list all indexes that needs to be opened
let mut must_open = Vec::new();
@ -128,21 +136,27 @@ impl Database {
continue;
}
};
let update_fn = Arc::new(ArcSwapFn::empty());
let env_clone = env.clone();
let index_clone = index.clone();
let name_clone = index_name.clone();
let update_fn_clone = update_fn.clone();
let handle = thread::spawn(move || {
update_awaiter(receiver, env_clone, update_fn_clone, index_clone)
update_awaiter(
receiver,
env_clone,
&name_clone,
update_fn_clone,
index_clone,
)
});
// send an update notification to make sure that
// possible pre-boot updates are consumed
sender.send(UpdateEvent::NewUpdate).unwrap();
let result = indexes.insert(index_name, (index, update_fn, handle));
let result = indexes.insert(index_name, (index, handle));
assert!(
result.is_none(),
"The index should not have been already open"
@ -154,6 +168,7 @@ impl Database {
common_store,
indexes_store,
indexes: RwLock::new(indexes),
update_fn,
})
}
@ -180,16 +195,21 @@ impl Database {
let env_clone = self.env.clone();
let index_clone = index.clone();
let no_update_fn = Arc::new(ArcSwapFn::empty());
let no_update_fn_clone = no_update_fn.clone();
let name_clone = name.to_owned();
let update_fn_clone = self.update_fn.clone();
let handle = thread::spawn(move || {
update_awaiter(receiver, env_clone, no_update_fn_clone, index_clone)
update_awaiter(
receiver,
env_clone,
&name_clone,
update_fn_clone,
index_clone,
)
});
writer.commit()?;
entry.insert((index.clone(), no_update_fn, handle));
entry.insert((index.clone(), handle));
Ok(index)
}
@ -201,7 +221,7 @@ impl Database {
let mut indexes_lock = self.indexes.write().unwrap();
match indexes_lock.remove_entry(name) {
Some((name, (index, _fn, handle))) => {
Some((name, (index, handle))) => {
// remove the index name from the list of indexes
// and clear all the LMDB dbi
let mut writer = self.env.write_txn()?;
@ -218,27 +238,13 @@ impl Database {
}
}
pub fn set_update_callback(&self, name: impl AsRef<str>, update_fn: BoxUpdateFn) -> bool {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((_, current_update_fn, _)) => {
pub fn set_update_callback(&self, update_fn: BoxUpdateFn) {
let update_fn = Some(Arc::new(update_fn));
current_update_fn.swap(update_fn);
true
}
None => false,
}
self.update_fn.swap(update_fn);
}
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((_, current_update_fn, _)) => {
current_update_fn.swap(None);
true
}
None => false,
}
pub fn unset_update_callback(&self) {
self.update_fn.swap(None);
}
pub fn copy_and_compact_to_path<P: AsRef<Path>>(&self, path: P) -> ZResult<File> {
@ -272,11 +278,12 @@ mod tests {
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
@ -323,7 +330,7 @@ mod tests {
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
}
#[test]
@ -334,11 +341,12 @@ mod tests {
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
@ -384,7 +392,58 @@ mod tests {
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_some());
}
#[test]
fn ignored_words_too_long() {
let dir = tempfile::tempdir().unwrap();
let database = Database::open_or_create(dir.path()).unwrap();
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
identifier = "id"
[attributes."name"]
displayed = true
indexed = true
"#;
toml::from_str(data).unwrap()
};
let mut writer = env.write_txn().unwrap();
let _update_id = index.schema_update(&mut writer, schema).unwrap();
writer.commit().unwrap();
let mut additions = index.documents_addition();
let doc1 = serde_json::json!({
"id": 123,
"name": "s̷̡̢̡̧̺̜̞͕͉͉͕̜͔̟̼̥̝͍̟̖͔͔̪͉̲̹̝̣̖͎̞̤̥͓͎̭̩͕̙̩̿̀̋̅̈́̌́̏̍̄̽͂̆̾̀̿̕̚̚͜͠͠ͅͅļ̵̨̨̨̰̦̻̳̖̳͚̬̫͚̦͖͈̲̫̣̩̥̻̙̦̱̼̠̖̻̼̘̖͉̪̜̠̙͖̙̩͔̖̯̩̲̿̽͋̔̿̍̓͂̍̿͊͆̃͗̔̎͐͌̾̆͗́̆̒̔̾̅̚̚͜͜ͅͅī̵̛̦̅̔̓͂͌̾́͂͛̎̋͐͆̽̂̋̋́̾̀̉̓̏̽́̑̀͒̇͋͛̈́̃̉̏͊̌̄̽̿̏̇͘̕̚̕p̶̧̛̛̖̯̗͕̝̗̭̱͙̖̗̟̟̐͆̊̂͐̋̓̂̈́̓͊̆͌̾̾͐͋͗͌̆̿̅͆̈́̈́̉͋̍͊͗̌̓̅̈̎̇̃̎̈́̉̐̋͑̃͘̕͘d̴̢̨̛͕̘̯͖̭̮̝̝̐̊̈̅̐̀͒̀́̈́̀͌̽͛͆͑̀̽̿͛̃̋̇̎̀́̂́͘͠͝ǫ̵̨̛̮̩̘͚̬̯̖̱͍̼͑͑̓̐́̑̿̈́̔͌̂̄͐͝ģ̶̧̜͇̣̭̺̪̺̖̻͖̮̭̣̙̻͒͊͗̓̓͒̀̀ͅ",
});
additions.update_document(doc1);
let mut writer = env.write_txn().unwrap();
let update_id = additions.finalize(&mut writer).unwrap();
writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
}
#[test]
@ -395,11 +454,12 @@ mod tests {
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
@ -474,7 +534,7 @@ mod tests {
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
reader.abort();
let mut additions = index.documents_addition();
@ -508,7 +568,7 @@ mod tests {
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
// even try to search for a document
let results = index.query_builder().query(&reader, "21 ", 0..20).unwrap();
@ -554,7 +614,7 @@ mod tests {
// check if it has been accepted
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_some());
}
#[test]
@ -565,11 +625,12 @@ mod tests {
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
@ -618,7 +679,7 @@ mod tests {
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
@ -642,11 +703,12 @@ mod tests {
let env = &database.env;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("test").unwrap();
let done = database.set_update_callback("test", Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let schema = {
let data = r#"
@ -698,7 +760,7 @@ mod tests {
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
let document: Option<IgnoredAny> = index.document(&reader, None, DocumentId(25)).unwrap();
assert!(document.is_none());
@ -741,7 +803,7 @@ mod tests {
let reader = env.read_txn().unwrap();
let result = index.update_status(&reader, update_id).unwrap();
assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok());
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
let document: Option<serde_json::Value> = index
.document(&reader, None, DocumentId(7900334843754999545))

View File

@ -25,79 +25,7 @@ pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
use ::serde::{Deserialize, Serialize};
use zerocopy::{AsBytes, FromBytes};
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(
Debug,
Copy,
Clone,
Eq,
PartialEq,
PartialOrd,
Ord,
Hash,
Serialize,
Deserialize,
AsBytes,
FromBytes,
)]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important: the derived `PartialOrd`/`Ord`
/// compare fields in declaration order, so it defines the way these
/// structures are ordered between themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}
pub use meilidb_types::{DocIndex, DocumentId, Highlight};
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]

View File

@ -7,6 +7,8 @@ use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;
const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer {
@ -128,6 +130,8 @@ fn index_token(
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
@ -138,6 +142,7 @@ fn index_token(
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower && !unidecoded.is_empty() {
let word = Vec::from(unidecoded);
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
@ -146,6 +151,8 @@ fn index_token(
}
}
}
}
}
None => return false,
}
}

View File

@ -39,7 +39,7 @@ impl DocsWords {
match self.docs_words.get(reader, &document_id)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}

View File

@ -31,7 +31,7 @@ impl Main {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
@ -68,7 +68,7 @@ impl Main {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
@ -86,7 +86,7 @@ impl Main {
match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}

View File

@ -221,7 +221,7 @@ impl Index {
&self,
reader: &heed::RoTxn,
update_id: u64,
) -> MResult<update::UpdateStatus> {
) -> MResult<Option<update::UpdateStatus>> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
@ -234,19 +234,21 @@ impl Index {
updates.reserve(last_id as usize);
for id in 0..=last_id {
let update = self.update_status(reader, id)?;
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
last_update_result_id = id;
}
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update_id(reader)? {
for id in last_update_result_id + 1..last_id {
let update = self.update_status(reader, id)?;
for id in last_update_result_id + 1..=last_id {
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
}
}
}
Ok(updates)
}

View File

@ -30,7 +30,7 @@ impl Synonyms {
match self.synonyms.get(reader, word)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}

View File

@ -1,12 +1,11 @@
use super::BEU64;
use crate::update::ProcessedUpdateResult;
use heed::types::{OwnedType, SerdeBincode};
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
pub(crate) updates_results:
heed::Database<OwnedType<BEU64>, SerdeBincode<ProcessedUpdateResult>>,
pub(crate) updates_results: heed::Database<OwnedType<BEU64>, SerdeJson<ProcessedUpdateResult>>,
}
impl UpdatesResults {

View File

@ -26,7 +26,7 @@ pub fn push_clear_all(
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::ClearAll;
let update = Update::clear_all();
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -18,7 +18,7 @@ pub fn push_customs_update(
) -> ZResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Customs(customs);
let update = Update::customs(customs);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -91,9 +91,9 @@ pub fn push_documents_addition<D: serde::Serialize>(
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = if is_partial {
Update::DocumentsPartial(values)
Update::documents_partial(values)
} else {
Update::DocumentsAddition(values)
Update::documents_addition(values)
};
updates_store.put_update(writer, last_update_id, &update)?;

View File

@ -76,7 +76,7 @@ pub fn push_documents_deletion(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsDeletion(deletion);
let update = Update::documents_deletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -22,8 +22,9 @@ pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::time::{Duration, Instant};
use std::time::Instant;
use chrono::{DateTime, Utc};
use heed::Result as ZResult;
use log::debug;
use serde::{Deserialize, Serialize};
@ -32,7 +33,85 @@ use crate::{store, DocumentId, MResult};
use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update {
/// An update as it is pushed to the updates store: the operation to apply
/// plus the UTC time at which it was created.
pub struct Update {
/// The operation carried by this update.
data: UpdateData,
/// When the update was built (set to `Utc::now()` by the constructors).
enqueued_at: DateTime<Utc>,
}
impl Update {
fn clear_all() -> Update {
Update {
data: UpdateData::ClearAll,
enqueued_at: Utc::now(),
}
}
fn schema(data: Schema) -> Update {
Update {
data: UpdateData::Schema(data),
enqueued_at: Utc::now(),
}
}
fn customs(data: Vec<u8>) -> Update {
Update {
data: UpdateData::Customs(data),
enqueued_at: Utc::now(),
}
}
fn documents_addition(data: Vec<HashMap<String, serde_json::Value>>) -> Update {
Update {
data: UpdateData::DocumentsAddition(data),
enqueued_at: Utc::now(),
}
}
fn documents_partial(data: Vec<HashMap<String, serde_json::Value>>) -> Update {
Update {
data: UpdateData::DocumentsPartial(data),
enqueued_at: Utc::now(),
}
}
fn documents_deletion(data: Vec<DocumentId>) -> Update {
Update {
data: UpdateData::DocumentsDeletion(data),
enqueued_at: Utc::now(),
}
}
fn synonyms_addition(data: BTreeMap<String, Vec<String>>) -> Update {
Update {
data: UpdateData::SynonymsAddition(data),
enqueued_at: Utc::now(),
}
}
fn synonyms_deletion(data: BTreeMap<String, Option<Vec<String>>>) -> Update {
Update {
data: UpdateData::SynonymsDeletion(data),
enqueued_at: Utc::now(),
}
}
fn stop_words_addition(data: BTreeSet<String>) -> Update {
Update {
data: UpdateData::StopWordsAddition(data),
enqueued_at: Utc::now(),
}
}
fn stop_words_deletion(data: BTreeSet<String>) -> Update {
Update {
data: UpdateData::StopWordsDeletion(data),
enqueued_at: Utc::now(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateData {
ClearAll,
Schema(Schema),
Customs(Vec<u8>),
@ -45,33 +124,31 @@ pub enum Update {
StopWordsDeletion(BTreeSet<String>),
}
impl Update {
impl UpdateData {
pub fn update_type(&self) -> UpdateType {
match self {
Update::ClearAll => UpdateType::ClearAll,
Update::Schema(schema) => UpdateType::Schema {
schema: schema.clone(),
},
Update::Customs(_) => UpdateType::Customs,
Update::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
UpdateData::ClearAll => UpdateType::ClearAll,
UpdateData::Schema(_) => UpdateType::Schema,
UpdateData::Customs(_) => UpdateType::Customs,
UpdateData::DocumentsAddition(addition) => UpdateType::DocumentsAddition {
number: addition.len(),
},
Update::DocumentsPartial(addition) => UpdateType::DocumentsPartial {
UpdateData::DocumentsPartial(addition) => UpdateType::DocumentsPartial {
number: addition.len(),
},
Update::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
UpdateData::DocumentsDeletion(deletion) => UpdateType::DocumentsDeletion {
number: deletion.len(),
},
Update::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
UpdateData::SynonymsAddition(addition) => UpdateType::SynonymsAddition {
number: addition.len(),
},
Update::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
UpdateData::SynonymsDeletion(deletion) => UpdateType::SynonymsDeletion {
number: deletion.len(),
},
Update::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
UpdateData::StopWordsAddition(addition) => UpdateType::StopWordsAddition {
number: addition.len(),
},
Update::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
UpdateData::StopWordsDeletion(deletion) => UpdateType::StopWordsDeletion {
number: deletion.len(),
},
}
@ -79,9 +156,10 @@ impl Update {
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "name")]
pub enum UpdateType {
ClearAll,
Schema { schema: Schema },
Schema,
Customs,
DocumentsAddition { number: usize },
DocumentsPartial { number: usize },
@ -92,30 +170,36 @@ pub enum UpdateType {
StopWordsDeletion { number: usize },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
pub main: Duration,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedUpdateResult {
pub update_id: u64,
#[serde(rename = "type")]
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
pub duration: f64, // in seconds
pub enqueued_at: DateTime<Utc>,
pub processed_at: DateTime<Utc>,
}
/// Describes an update that is present in the updates store
/// but has no processed result yet.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnqueuedUpdateResult {
/// The identifier of the update.
pub update_id: u64,
/// The kind of update that was enqueued.
pub update_type: UpdateType,
/// When the update was enqueued (UTC).
pub enqueued_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", tag = "status")]
pub enum UpdateStatus {
Enqueued(EnqueuedUpdateResult),
Processed(ProcessedUpdateResult),
Unknown,
Enqueued {
#[serde(flatten)]
content: EnqueuedUpdateResult,
},
Processed {
#[serde(flatten)]
content: ProcessedUpdateResult,
},
}
pub fn update_status(
@ -123,19 +207,19 @@ pub fn update_status(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
) -> MResult<UpdateStatus> {
) -> MResult<Option<UpdateStatus>> {
match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)),
None => {
if let Some(update) = updates_store.get(reader, update_id)? {
Ok(UpdateStatus::Enqueued(EnqueuedUpdateResult {
Some(result) => Ok(Some(UpdateStatus::Processed { content: result })),
None => match updates_store.get(reader, update_id)? {
Some(update) => Ok(Some(UpdateStatus::Enqueued {
content: EnqueuedUpdateResult {
update_id,
update_type: update.update_type(),
}))
} else {
Ok(UpdateStatus::Unknown)
}
}
update_type: update.data.update_type(),
enqueued_at: update.enqueued_at,
},
})),
None => Ok(None),
},
}
}
@ -164,8 +248,10 @@ pub fn update_task<'a, 'b>(
) -> MResult<ProcessedUpdateResult> {
debug!("Processing update number {}", update_id);
let (update_type, result, duration) = match update {
Update::ClearAll => {
let Update { enqueued_at, data } = update;
let (update_type, result, duration) = match data {
UpdateData::ClearAll => {
let start = Instant::now();
let update_type = UpdateType::ClearAll;
@ -180,12 +266,10 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::Schema(schema) => {
UpdateData::Schema(schema) => {
let start = Instant::now();
let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let update_type = UpdateType::Schema;
let result = apply_schema_update(
writer,
&schema,
@ -198,7 +282,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::Customs(customs) => {
UpdateData::Customs(customs) => {
let start = Instant::now();
let update_type = UpdateType::Customs;
@ -206,7 +290,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::DocumentsAddition(documents) => {
UpdateData::DocumentsAddition(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsAddition {
@ -225,7 +309,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::DocumentsPartial(documents) => {
UpdateData::DocumentsPartial(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsPartial {
@ -244,7 +328,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::DocumentsDeletion(documents) => {
UpdateData::DocumentsDeletion(documents) => {
let start = Instant::now();
let update_type = UpdateType::DocumentsDeletion {
@ -263,7 +347,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::SynonymsAddition(synonyms) => {
UpdateData::SynonymsAddition(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsAddition {
@ -274,7 +358,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::SynonymsDeletion(synonyms) => {
UpdateData::SynonymsDeletion(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsDeletion {
@ -285,7 +369,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::StopWordsAddition(stop_words) => {
UpdateData::StopWordsAddition(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsAddition {
@ -297,7 +381,7 @@ pub fn update_task<'a, 'b>(
(update_type, result, start.elapsed())
}
Update::StopWordsDeletion(stop_words) => {
UpdateData::StopWordsDeletion(stop_words) => {
let start = Instant::now();
let update_type = UpdateType::StopWordsDeletion {
@ -323,12 +407,13 @@ pub fn update_task<'a, 'b>(
update_id, update_type, result
);
let detailed_duration = DetailedDuration { main: duration };
let status = ProcessedUpdateResult {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
error: result.map_err(|e| e.to_string()).err(),
duration: duration.as_secs_f64(),
enqueued_at,
processed_at: Utc::now(),
};
Ok(status)

View File

@ -68,7 +68,7 @@ pub fn push_schema_update(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Schema(schema);
let update = Update::schema(schema);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -53,7 +53,7 @@ pub fn push_stop_words_addition(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsAddition(addition);
let update = Update::stop_words_addition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -54,7 +54,7 @@ pub fn push_stop_words_deletion(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::StopWordsDeletion(deletion);
let update = Update::stop_words_deletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -63,7 +63,7 @@ pub fn push_synonyms_addition(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsAddition(addition);
let update = Update::synonyms_addition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -70,7 +70,7 @@ pub fn push_synonyms_deletion(
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsDeletion(deletion);
let update = Update::synonyms_deletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -1,6 +1,6 @@
[package]
name = "meilidb-http"
version = "0.1.1"
version = "0.2.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",
@ -10,7 +10,8 @@ edition = "2018"
[dependencies]
bincode = "1.2.0"
chrono = { version = "0.4.9", features = ["serde"] }
crossbeam-channel = "0.3.9"
crossbeam-channel = "0.4.0"
env_logger = "0.7.1"
envconfig = "0.5.1"
envconfig_derive = "0.5.1"
heed = "0.5.0"
@ -19,7 +20,7 @@ indexmap = { version = "1.3.0", features = ["serde-1"] }
jemallocator = "0.3.2"
log = "0.4.8"
main_error = "0.1.0"
meilidb-core = { path = "../meilidb-core", version = "0.6.0" }
meilidb-core = { path = "../meilidb-core", version = "0.7.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.6.0" }
pretty-bytes = "0.2.2"
rand = "0.7.2"
@ -33,7 +34,7 @@ walkdir = "2.2.9"
[dependencies.async-compression]
default-features = false
features = ["stream", "gzip", "zlib", "brotli", "zstd"]
version = "0.1.0-alpha.7"
version = "=0.1.0-alpha.7"
[dependencies.tide]
git = "https://github.com/rustasync/tide"

View File

@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use chrono::{DateTime, Utc};
@ -35,7 +34,6 @@ pub struct DataInner {
pub db_path: String,
pub admin_token: Option<String>,
pub server_pid: Pid,
pub accept_updates: Arc<AtomicBool>,
}
impl DataInner {
@ -70,25 +68,6 @@ impl DataInner {
.map_err(Into::into)
}
/// Reads the datetime stored under the `"last-backup"` key of the common
/// store, or `None` when the key has never been written.
pub fn last_backup(&self, reader: &heed::RoTxn) -> MResult<Option<DateTime<Utc>>> {
    let store = self.db.common_store();
    let last_backup = store.get::<Str, SerdeDatetime>(&reader, "last-backup")?;
    Ok(last_backup)
}
/// Stores the current UTC time under the `"last-backup"` key of the
/// common store, using the given write transaction.
pub fn set_last_backup(&self, writer: &mut heed::RwTxn) -> MResult<()> {
    let store = self.db.common_store();
    let now = Utc::now();
    store.put::<Str, SerdeDatetime>(writer, "last-backup", &now)?;
    Ok(())
}
pub fn fields_frequency(
&self,
reader: &heed::RoTxn,
@ -143,14 +122,6 @@ impl DataInner {
Ok(())
}
/// Sets the shared `accept_updates` flag to `false`.
///
/// The store uses `Ordering::Relaxed`, so it does not order other memory
/// accesses around it.
pub fn stop_accept_updates(&self) {
self.accept_updates.store(false, Ordering::Relaxed);
}
/// Returns the current value of the shared `accept_updates` flag
/// (relaxed atomic load).
pub fn accept_updates(&self) -> bool {
self.accept_updates.load(Ordering::Relaxed)
}
}
impl Data {
@ -160,30 +131,22 @@ impl Data {
let server_pid = sysinfo::get_current_pid().unwrap();
let db = Arc::new(Database::open_or_create(opt.database_path.clone()).unwrap());
let accept_updates = Arc::new(AtomicBool::new(true));
let inner_data = DataInner {
db: db.clone(),
db_path,
admin_token,
server_pid,
accept_updates,
};
let data = Data {
inner: Arc::new(inner_data),
};
for index_name in db.indexes_names().unwrap() {
let callback_context = data.clone();
let callback_name = index_name.clone();
db.set_update_callback(
index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
}
db.set_update_callback(Box::new(move |index_name, status| {
index_update_callback(&index_name, &callback_context, status);
}));
data
}

View File

@ -17,6 +17,7 @@ pub enum ResponseError {
DocumentNotFound(String),
MissingHeader(String),
BadParameter(String, String),
OpenIndex(String),
CreateIndex(String),
Maintenance,
}
@ -54,6 +55,10 @@ impl ResponseError {
ResponseError::BadParameter(name.to_string(), message.to_string())
}
/// Builds a `ResponseError::OpenIndex` from anything displayable.
///
/// The message is rendered with `to_string()` and stored in the variant.
pub fn open_index(message: impl Display) -> ResponseError {
ResponseError::OpenIndex(message.to_string())
}
/// Builds a `ResponseError::CreateIndex` from anything displayable.
///
/// The message is rendered with `to_string()` and stored in the variant.
pub fn create_index(message: impl Display) -> ResponseError {
ResponseError::CreateIndex(message.to_string())
}
@ -96,6 +101,10 @@ impl IntoResponse for ResponseError {
format!("Impossible to create index; {}", err),
StatusCode::BAD_REQUEST,
),
ResponseError::OpenIndex(err) => error(
format!("Impossible to open index; {}", err),
StatusCode::BAD_REQUEST,
),
ResponseError::Maintenance => error(
String::from("Server is in maintenance, please try again later"),
StatusCode::SERVICE_UNAVAILABLE,

View File

@ -131,6 +131,12 @@ impl<'a> SearchBuilder<'a> {
self
}
/// Adds one attribute to the set of attributes to search in.
///
/// The set is created on first use; inserting an attribute that is
/// already present leaves the set unchanged.
pub fn add_attribute_to_search_in(&mut self, value: String) -> &SearchBuilder {
    self.attributes_to_search_in
        .get_or_insert_with(HashSet::new)
        .insert(value);
    self
}
pub fn attributes_to_highlight(&mut self, value: HashSet<String>) -> &SearchBuilder {
self.attributes_to_highlight = Some(value);
self
@ -229,43 +235,35 @@ impl<'a> SearchBuilder<'a> {
}
fields = Some(set);
}
let mut document: IndexMap<String, Value> = self
let document: IndexMap<String, Value> = self
.index
.document(reader, fields.as_ref(), doc.id)
.map_err(|e| Error::RetrieveDocument(doc.id.0, e.to_string()))?
.ok_or(Error::DocumentNotFound(doc.id.0))?;
let mut formatted = document.clone();
let mut matches = doc.highlights.clone();
// Crops fields if needed
if let Some(fields) = self.attributes_to_crop.clone() {
for (field, length) in fields {
let _ = crop_document(&mut document, &mut matches, &schema, &field, length);
}
if let Some(fields) = &self.attributes_to_crop {
crop_document(&mut formatted, &mut matches, &schema, fields);
}
// Transform to readable matches
let matches = calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema);
if !self.matches {
if let Some(attributes_to_highlight) = self.attributes_to_highlight.clone() {
let highlights = calculate_highlights(
document.clone(),
matches.clone(),
attributes_to_highlight,
);
for (key, value) in highlights {
if let Some(content) = document.get_mut(&key) {
*content = value;
}
}
if let Some(attributes_to_highlight) = &self.attributes_to_highlight {
formatted = calculate_highlights(&formatted, &matches, attributes_to_highlight);
}
}
let matches_info = if self.matches { Some(matches) } else { None };
let hit = SearchHit {
hit: document,
document,
formatted,
matches_info,
};
@ -382,7 +380,9 @@ pub type MatchesInfos = HashMap<String, Vec<MatchPosition>>;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchHit {
#[serde(flatten)]
pub hit: IndexMap<String, Value>,
pub document: IndexMap<String, Value>,
#[serde(rename = "_formatted", skip_serializing_if = "IndexMap::is_empty")]
pub formatted: IndexMap<String, Value>,
#[serde(rename = "_matchesInfo", skip_serializing_if = "Option::is_none")]
pub matches_info: Option<MatchesInfos>,
}
@ -425,32 +425,31 @@ fn crop_document(
document: &mut IndexMap<String, Value>,
matches: &mut Vec<Highlight>,
schema: &Schema,
field: &str,
length: usize,
) -> Result<(), Error> {
fields: &HashMap<String, usize>,
) {
matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let attribute = schema
.attribute(field)
.ok_or(Error::AttributeNotFoundOnSchema(field.to_string()))?;
for (field, length) in fields {
let attribute = match schema.attribute(field) {
Some(attribute) => attribute,
None => continue,
};
let selected_matches = matches
.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attribute)
.cloned();
let original_text = match document.get(field) {
Some(Value::String(text)) => text,
Some(_) => return Err(Error::CropFieldWrongType(field.to_string())),
None => return Err(Error::AttributeNotFoundOnDocument(field.to_string())),
};
let (cropped_text, cropped_matches) = crop_text(&original_text, selected_matches, length);
document.insert(
field.to_string(),
serde_json::value::Value::String(cropped_text),
);
if let Some(Value::String(ref mut original_text)) = document.get_mut(field) {
let (cropped_text, cropped_matches) =
crop_text(original_text, selected_matches, *length);
*original_text = cropped_text;
matches.retain(|m| SchemaAttr::new(m.attribute) != attribute);
matches.extend_from_slice(&cropped_matches);
Ok(())
}
}
}
fn calculate_matches(
@ -490,13 +489,14 @@ fn calculate_matches(
}
fn calculate_highlights(
document: IndexMap<String, Value>,
matches: MatchesInfos,
attributes_to_highlight: HashSet<String>,
) -> HighlightInfos {
let mut highlight_result: HashMap<String, Value> = HashMap::new();
document: &IndexMap<String, Value>,
matches: &MatchesInfos,
attributes_to_highlight: &HashSet<String>,
) -> IndexMap<String, Value> {
let mut highlight_result = IndexMap::new();
for (attribute, matches) in matches.iter() {
if attributes_to_highlight.contains("*") || attributes_to_highlight.contains(attribute) {
if attributes_to_highlight.contains(attribute) {
if let Some(Value::String(value)) = document.get(attribute) {
let value: Vec<_> = value.chars().collect();
let mut highlighted_value = String::new();
@ -521,6 +521,7 @@ fn calculate_highlights(
};
}
}
highlight_result
}
@ -537,9 +538,10 @@ mod tests {
let document: IndexMap<String, Value> = serde_json::from_str(data).unwrap();
let mut attributes_to_highlight = HashSet::new();
attributes_to_highlight.insert("*".to_string());
attributes_to_highlight.insert("title".to_string());
attributes_to_highlight.insert("description".to_string());
let mut matches: HashMap<String, Vec<MatchPosition>> = HashMap::new();
let mut matches = HashMap::new();
let mut m = Vec::new();
m.push(MatchPosition {
@ -554,9 +556,9 @@ mod tests {
length: 9,
});
matches.insert("description".to_string(), m);
let result = super::calculate_highlights(document, matches, attributes_to_highlight);
let result = super::calculate_highlights(&document, &matches, &attributes_to_highlight);
let mut result_expected = HashMap::new();
let mut result_expected = IndexMap::new();
result_expected.insert(
"title".to_string(),
Value::String("<em>Fondation</em> (Isaac ASIMOV)".to_string()),

View File

@ -8,4 +8,4 @@ pub mod models;
pub mod option;
pub mod routes;
use self::data::Data;
pub use self::data::Data;

View File

@ -7,15 +7,23 @@ use tide_log::RequestLogger;
use meilidb_http::data::Data;
use meilidb_http::option::Opt;
use meilidb_http::routes;
use meilidb_http::routes::index::index_update_callback;
#[cfg(not(target_os = "macos"))]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
pub fn main() -> Result<(), MainError> {
let opt = Opt::new();
env_logger::init();
let opt = Opt::new();
let data = Data::new(opt.clone());
let data_cloned = data.clone();
data.db.set_update_callback(Box::new(move |name, status| {
index_update_callback(name, &data_cloned, status);
}));
let mut app = tide::App::with_state(data);
app.middleware(

View File

@ -45,10 +45,6 @@ pub struct IndexUpdateResponse {
pub async fn delete_document(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let identifier = ctx.identifier()?;
let document_id = meilidb_core::serde::compute_document_id(identifier.clone());
@ -154,9 +150,6 @@ fn infered_schema(document: &IndexMap<String, Value>) -> Option<meilidb_schema::
async fn update_multiple_documents(mut ctx: Context<Data>, is_partial: bool) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<IndexMap<String, Value>> =
ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
@ -211,9 +204,7 @@ pub async fn add_or_update_multiple_documents(ctx: Context<Data>) -> SResult<Res
pub async fn delete_multiple_documents(mut ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let data: Vec<Value> = ctx.body_json().await.map_err(ResponseError::bad_request)?;
let index = ctx.index()?;
@ -243,9 +234,7 @@ pub async fn delete_multiple_documents(mut ctx: Context<Data>) -> SResult<Respon
pub async fn clear_all_documents(ctx: Context<Data>) -> SResult<Response> {
ctx.is_allowed(DocumentsWrite)?;
if !ctx.state().accept_updates() {
return Err(ResponseError::Maintenance);
}
let index = ctx.index()?;
let env = &ctx.state().db.env;

View File

@ -1,5 +1,5 @@
use http::StatusCode;
use meilidb_core::{ProcessedUpdateResult, UpdateStatus};
use meilidb_core::ProcessedUpdateResult;
use meilidb_schema::Schema;
use serde_json::json;
use tide::response::IntoResponse;
@ -66,21 +66,9 @@ pub async fn create_index(mut ctx: Context<Data>) -> SResult<Response> {
let created_index = match db.create_index(&index_name) {
Ok(index) => index,
Err(meilidb_core::Error::IndexAlreadyExists) => db.open_index(&index_name).ok_or(
ResponseError::internal("index not found but must have been found"),
)?,
Err(e) => return Err(ResponseError::create_index(e)),
};
let callback_context = ctx.state().clone();
let callback_name = index_name.clone();
db.set_update_callback(
&index_name,
Box::new(move |status| {
index_update_callback(&callback_name, &callback_context, status);
}),
);
let env = &db.env;
let mut writer = env.write_txn().map_err(ResponseError::internal)?;
@ -150,17 +138,10 @@ pub async fn get_update_status(ctx: Context<Data>) -> SResult<Response> {
.map_err(ResponseError::internal)?;
let response = match status {
UpdateStatus::Enqueued(data) => {
tide::response::json(json!({ "status": "enqueued", "data": data }))
Some(status) => tide::response::json(status)
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Processed(data) => {
tide::response::json(json!({ "status": "processed", "data": data }))
.with_status(StatusCode::OK)
.into_response()
}
UpdateStatus::Unknown => tide::response::json(json!({ "message": "unknown update id" }))
.into_response(),
None => tide::response::json(json!({ "message": "unknown update id" }))
.with_status(StatusCode::NOT_FOUND)
.into_response(),
};
@ -197,7 +178,7 @@ pub async fn delete_index(ctx: Context<Data>) -> SResult<StatusCode> {
.map_err(ResponseError::internal)?;
if found {
Ok(StatusCode::OK)
Ok(StatusCode::NO_CONTENT)
} else {
Ok(StatusCode::NOT_FOUND)
}

View File

@ -36,6 +36,12 @@ pub async fn search_with_url_query(ctx: Context<Data>) -> SResult<Response> {
let env = &ctx.state().db.env;
let reader = env.read_txn().map_err(ResponseError::internal)?;
let schema = index
.main
.schema(&reader)
.map_err(ResponseError::internal)?
.ok_or(ResponseError::open_index("No Schema found"))?;
let query: SearchQuery = ctx
.url_query()
.map_err(|_| ResponseError::bad_request("invalid query parameter"))?;
@ -56,23 +62,36 @@ pub async fn search_with_url_query(ctx: Context<Data>) -> SResult<Response> {
}
if let Some(attributes_to_search_in) = query.attributes_to_search_in {
for attr in attributes_to_search_in.split(',') {
search_builder.add_retrievable_field(attr.to_string());
search_builder.add_attribute_to_search_in(attr.to_string());
}
}
if let Some(attributes_to_crop) = query.attributes_to_crop {
let crop_length = query.crop_length.unwrap_or(200);
if attributes_to_crop == "*" {
let attributes_to_crop = schema
.iter()
.map(|(attr, ..)| (attr.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
} else {
let attributes_to_crop = attributes_to_crop
.split(',')
.map(|r| (r.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
}
}
if let Some(attributes_to_highlight) = query.attributes_to_highlight {
let attributes_to_highlight = attributes_to_highlight
let attributes_to_highlight = if attributes_to_highlight == "*" {
schema.iter().map(|(attr, ..)| attr.to_string()).collect()
} else {
attributes_to_highlight
.split(',')
.map(ToString::to_string)
.collect();
.collect()
};
search_builder.attributes_to_highlight(attributes_to_highlight);
}

View File

@ -47,7 +47,7 @@ fn classify_separator(c: char) -> Option<SeparatorCategory> {
c if c.is_whitespace() => Some(Soft), // whitespaces
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
'-' | '_' | '\'' | ':' => Some(Soft),
'-' | '_' | '\'' | ':' | '/' | '\\' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None,
}

17
meilidb-types/Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "meilidb-types"
version = "0.1.0"
authors = ["Clément Renault <renault.cle@gmail.com>"]
edition = "2018"
[dependencies.zerocopy]
version = "0.2.8"
optional = true
[dependencies.serde]
version = "1.0.101"
features = ["derive"]
optional = true
[features]
default = ["serde", "zerocopy"]

65
meilidb-types/src/lib.rs Normal file
View File

@ -0,0 +1,65 @@
#[cfg(feature = "zerocopy")]
use zerocopy::{AsBytes, FromBytes};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// Represents an internally generated unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
///
/// This is a newtype around a `u64`. The `AsBytes`/`FromBytes` implementations
/// are only derived when the `zerocopy` feature is enabled, and the
/// `Serialize`/`Deserialize` implementations only when the `serde` feature is.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
///
/// The `AsBytes`/`FromBytes` implementations are only derived when the
/// `zerocopy` feature is enabled.
///
/// The order of the fields matters: it fixes the `#[repr(C)]` layout and,
/// because the comparison traits are derived, it also defines how two
/// `DocIndex` values are ordered (fields are compared in declaration order).
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The index of the word in that attribute.
pub word_index: u16,
// NOTE(review): the two fields below are named `char_*` while their
// documentation speaks about bytes; confirm which unit is actually stored.
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length of the word found at `char_index`.
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves
/// (the comparison traits are derived, so fields are compared
/// in declaration order).
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found.
pub attribute: u16,
// NOTE(review): the fields below are named `char_*` while their
// documentation speaks about bytes; confirm which unit is actually stored.
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}

BIN
misc/crates-io-demo.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 MiB