Compare commits


106 Commits
v0.1 ... v0.2.1

Author SHA1 Message Date
810dfdf656 Merge pull request #90 from Kerollmops/version-bump
Bump version to 0.2.1
2019-01-25 17:08:53 +01:00
f016652fca chore: Bump version to 0.2.1 2019-01-25 16:41:08 +01:00
6c99ebe3fa Merge pull request #89 from Kerollmops/no-more-compaction
Remove the manual compaction triggering
2019-01-25 16:40:08 +01:00
94d357985f feat: Remove the manual compaction triggering 2019-01-25 16:05:56 +01:00
fbc698567a Merge pull request #87 from Kerollmops/measure-index-loading
Display index loading times
2019-01-24 14:07:11 +01:00
aa9db14c09 chore: Display index loading times 2019-01-23 11:19:44 +01:00
61e83a1c21 Merge pull request #86 from Kerollmops/measure-indexation
Display timings of indexation operations
2019-01-16 13:32:44 +01:00
1316be5b09 chore: Display timings of indexation operations 2019-01-16 11:45:33 +01:00
4e8b0383dd Merge pull request #85 from Kerollmops/debug-more-stats
Display more stats infos
2019-01-15 14:20:28 +01:00
4fa10753c1 chore: Display more stats infos 2019-01-14 21:18:46 +01:00
2473e289e8 Merge pull request #84 from qdequele/create-server-example
Example HTTP server example can use stopwords
2019-01-14 18:55:58 +01:00
e0e5e87ed3 feat: HTTP server example can use stopwords 2019-01-14 18:21:58 +01:00
b13e61f40a Merge pull request #83 from qdequele/create-server-example
Create an example of HTTP server managing multiple databases
2019-01-14 14:35:33 +01:00
c023cb3065 feat: Create an example for HTTP server managing multiple databases 2019-01-14 13:39:54 +01:00
0a3d069fbc Merge pull request #79 from qdequele/master
Schema can be de/serialized from a json format
2019-01-12 21:50:02 +01:00
fa062ce2cf feat: Schema can be de/serialized from a json format 2019-01-12 21:05:48 +01:00
cdc6e47bf5 Merge pull request #81 from Kerollmops/update-readme
Simplify the examples command lines
2019-01-12 13:43:42 +01:00
d5f44838be doc: Simplify the examples command lines 2019-01-12 12:56:11 +01:00
5939f6e68a Merge pull request #80 from Kerollmops/version-bump
Bump version to 0.2.0
2019-01-12 12:52:08 +01:00
97edc987f8 chore: Bump version to 0.2.0 2019-01-12 12:18:29 +01:00
e4e50cecce Merge pull request #77 from Kerollmops/update-dependencies
Update the quickcheck dev-dependency
2019-01-10 22:09:44 +01:00
77e0c19749 chore: Update the quickcheck dev-dependency 2019-01-10 21:25:32 +01:00
251bccbbc3 Merge pull request #76 from Kerollmops/update-readme
Update readme
2019-01-10 21:20:39 +01:00
f7561f8552 doc: Update examples usages 2019-01-10 21:14:01 +01:00
05fd7e87ec doc: Add some wrk stats to the Readme 2019-01-10 21:13:54 +01:00
446d6a5455 Merge pull request #75 from Kerollmops/binary-group-by-mut-query-builder
Introduce binary group by in the query builder
2019-01-10 21:10:31 +01:00
78786a0007 feat: Introduce binary group by in the query builder 2019-01-10 20:13:40 +01:00
3d820a27ee Merge pull request #74 from Kerollmops/same-document-update-shadowed
Make multiple document updates shadow themselves
2019-01-10 15:57:49 +01:00
ac347d788c feat: Make multiple document updates shadow themselves 2019-01-10 15:25:24 +01:00
5627f15d41 Merge pull request #73 from Kerollmops/module-for-attribute-wordarea
Module for attribute wordarea
2019-01-10 15:23:03 +01:00
e31afc2da2 chore: Move the WordArea type to its own module 2019-01-10 13:37:22 +01:00
77c252e12a chore: Move the Attribute type to its own module 2019-01-10 11:59:42 +01:00
30c9c053c2 Merge pull request #72 from Kerollmops/wordarea-char-index
Make WordArea be based on char index and length
2019-01-09 20:53:59 +01:00
b53ef08d05 feat: Make WordArea be based on char index and length 2019-01-09 20:14:08 +01:00
86bfb173ef Merge pull request #70 from Kerollmops/fix-assert-new-attribute
Remove assert on Attribute::new()
2019-01-09 11:09:18 +01:00
8e5f834625 chore: remove assert on Attribute::new() 2019-01-08 18:46:55 +01:00
563b021679 Merge pull request #69 from tpayet/patch-1
Update README.md
2019-01-08 18:45:10 +01:00
681f721b1d Correct README typos 2019-01-08 17:09:48 +01:00
8a7c061539 Update README.md 2019-01-08 17:09:48 +01:00
8c781a4d05 Merge pull request #67 from Kerollmops/reintroduce-stop-words
Reintroduce stop words
2019-01-07 13:29:23 +01:00
de59ea495d feat: Log some update steps 2019-01-06 22:49:12 +01:00
966eda8ae5 feat: Do the sum of typos using usizes 2019-01-06 22:49:12 +01:00
32f8908d71 feat: Reintroduce stopwords for the serializer 2019-01-06 22:49:11 +01:00
a2f5e8aa25 Merge pull request #66 from Kerollmops/revert-precompute-query-index-groups
Revert precompute query index groups
2019-01-06 22:38:44 +01:00
f00b978801 Revert "feat: Pre-compute matches query index groups"
This reverts commit 039a9a4cc7.
2019-01-06 21:54:49 +01:00
a78b5d225f Revert "feat: Allow Matches to be constructed"
This reverts commit d21406a939.
2019-01-06 21:44:53 +01:00
f32a59720d Revert "feat: Introducing the Matches as_matches method"
This reverts commit ef7ba96d4a.
2019-01-06 21:44:53 +01:00
2cc5fbde1a Revert "feat: Introduce multiple Iterator impl for Matches"
This reverts commit c594597a01.
2019-01-06 21:44:53 +01:00
34d2850d28 Revert "feat: Prefer using ranges and not using unreachable!"
This reverts commit d899b86603.
2019-01-06 21:44:51 +01:00
023f62b0ce Merge pull request #65 from Kerollmops/logging
Add a little bit of logging
2019-01-06 15:55:48 +01:00
7f35b971f0 feat: Log the total number of documents to rank 2019-01-06 15:02:53 +01:00
3418adb06a feat: Add log libraries dependencies 2019-01-06 15:02:53 +01:00
510426c05c Merge pull request #64 from Kerollmops/precompute-query-index-groups
Precompute query index groups
2019-01-06 14:59:04 +01:00
c74caa0f82 feat: Sum usizes instead of little u16/u32 2019-01-06 13:54:14 +01:00
d899b86603 feat: Prefer using ranges and not using unreachable! 2019-01-06 13:54:14 +01:00
0d07af3caf fix: Filter and count the exact matching words 2019-01-06 13:54:13 +01:00
c594597a01 feat: Introduce multiple Iterator impl for Matches 2019-01-06 13:54:13 +01:00
ef7ba96d4a feat: Introducing the Matches as_matches method 2019-01-06 13:54:13 +01:00
d21406a939 feat: Allow Matches to be constructed 2019-01-06 13:54:13 +01:00
039a9a4cc7 feat: Pre-compute matches query index groups 2019-01-06 11:11:55 +01:00
40ab9e7a55 Merge pull request #63 from Kerollmops/update-rocksdb
Update RocksDB to Titan
2019-01-06 10:37:54 +01:00
d21abb50fa chore: Update RocksDB to Titan 2019-01-05 12:47:03 +01:00
3dd5e2445a Merge pull request #62 from Kerollmops/test-document-key-attr
Add tests to DocumentKeyAttr
2019-01-02 22:20:37 +01:00
7f5e6c5b6e test: Add test to the DocumentKeyAttr slice repr 2019-01-02 21:48:58 +01:00
e6d3840f12 Merge pull request #61 from Kerollmops/update-remove-kv-attributes
UpdateBuilder handles document attributes deletion
2019-01-02 18:20:14 +01:00
c05fab783a fix: Write and Read DocumentKeyAttr in big endian 2019-01-02 17:53:53 +01:00
95dc6fe904 feat: Rework the UpdateBuilder struct 2019-01-02 17:53:52 +01:00
b2e9ae4136 Merge pull request #60 from Kerollmops/improve-perfs
Improve performances
2019-01-01 17:03:41 +01:00
b070778d44 feat: Use the jemalloc global allocator in examples 2019-01-01 16:37:15 +01:00
6731025003 chore: Update group-by 2019-01-01 16:27:39 +01:00
04544c1531 feat: Expose nightly features of some dependencies 2019-01-01 16:27:08 +01:00
9dd68b4eaa Merge pull request #58 from Kerollmops/clean-up
Clean up some database functions
2019-01-01 11:43:27 +01:00
1d67012aa5 chore: Clean up some database functions 2019-01-01 01:40:20 +01:00
e723e01ec8 Merge pull request #57 from Kerollmops/clippy-pass
Clippy pass
2018-12-31 23:46:18 +01:00
7845292ea8 chore: Clippy pass 2018-12-31 23:20:30 +01:00
521df85c0d Merge pull request #55 from Kerollmops/add-benchmarks
Add benchmarks
2018-12-31 21:48:38 +01:00
dfa19582a2 test: Add benchmarks to mesure the words proximity criterion 2018-12-31 21:18:42 +01:00
87ec95f7a0 test: Add benchmarks to mesure the database 2018-12-31 21:18:37 +01:00
76ef2cceeb Merge pull request #49 from Kerollmops/serialize-any-map
Serialize any map
2018-12-31 21:11:17 +01:00
20b5a6a06e doc: Add examples for runtime defined data and Schema 2018-12-31 20:44:33 +01:00
a842e647f7 Merge pull request #56 from Kerollmops/new-index-struct
New Index structure
2018-12-31 19:55:18 +01:00
21bb38c3b0 test: Add more tests for updates ingestion 2018-12-31 19:27:21 +01:00
64d53ee1bd chore: Rework the data module structures
being able to be constructed from SharedData
2018-12-31 19:27:21 +01:00
c022fa3fca chore: Move serde related structs to their module 2018-12-31 19:26:28 +01:00
0080bf486f feat: Introduce the new Index structure
replacing the old ugly Blob system
2018-12-31 19:26:27 +01:00
6bd779f9ae feat: Improve the deserialization time of a Blob 2018-12-31 13:15:37 +01:00
a18401f47e Merge pull request #53 from Kerollmops/query-builder-filter
Distinct/QueryBuilder filtering
2018-12-29 23:11:43 +01:00
7132c3be89 feat: Allow filtering on QueryBuilder 2018-12-29 22:30:41 +01:00
aa3d059363 feat: Allow filtering on DistinctQueryBuilder 2018-12-29 22:30:41 +01:00
e2a9dbc404 feat: Introduce filtering methods for Distinct/QueryBuilder 2018-12-29 22:30:40 +01:00
a0a11faee5 Merge pull request #54 from Kerollmops/arccell-instead-of-rwlock
Prefer using ArcCell instead of RWLock for database updates
2018-12-29 22:29:35 +01:00
36ef9581aa feat: Return the database view for each update 2018-12-29 21:07:01 +01:00
f4b04dfb72 feat: Prefer doing DatabaseView updates atomically 2018-12-29 20:52:00 +01:00
cf5d56e63a Merge pull request #52 from Kerollmops/schema-toml
Schema can be de/serialized from a toml format
2018-12-28 19:59:40 +01:00
8412c14b5b feat: Schema can be toml de/serialized 2018-12-28 19:24:50 +01:00
70772eca5c Merge pull request #51 from Kerollmops/wordarea-attribute-fallible
Make the Attribute and WordArea errors recoverable
2018-12-28 18:26:19 +01:00
b27f632e14 feat: Make the Attribute and WordArea errors recoverable 2018-12-28 16:15:22 +01:00
e3bfb866e5 Merge pull request #46 from Kerollmops/schema-considers-id
Schema considers document ids
2018-12-27 12:26:57 +01:00
fa238f21ef feat: Move Database to its own module 2018-12-27 11:21:47 +01:00
444a4c1af7 feat: Make the schema consider document ids 2018-12-27 11:21:47 +01:00
2e5c5fad33 Merge pull request #45 from Kerollmops/index-length-in-docindex
Introduce the WordArea struct
2018-12-24 17:08:20 +01:00
b32c96cdc9 feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
2018-12-24 15:58:46 +01:00
62521262e8 Merge pull request #44 from Kerollmops/real-document-id-type
Create a real DocumentId type
2018-12-24 15:41:47 +01:00
4ebae7784c feat: Create a strong DocumentId type
Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types.
2018-12-24 12:42:24 +01:00
a756ca5e3f Merge pull request #39 from Kerollmops/readme-badges
Add badges to the README
2018-12-19 14:42:54 +01:00
aa104fa253 doc: Add some funny badges to the README 2018-12-19 12:00:29 +01:00
55 changed files with 3846 additions and 2036 deletions


@ -1,39 +1,61 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.1.0"
version = "0.2.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
bincode = "1.0"
byteorder = "1.2"
crossbeam = "0.6"
elapsed = "0.1"
fst = "0.3"
hashbrown = "0.1"
hashbrown = { version = "0.1", features = ["serde"] }
lazy_static = "1.1"
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
log = "0.4"
sdset = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
unidecode = "0.3"
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "c2eb140"
rev = "306e201"
[dependencies.group-by]
git = "https://github.com/Kerollmops/group-by.git"
rev = "cab857b"
rev = "5a113fe"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
simd = ["rocksdb/sse"]
portable = ["rocksdb/portable"]
nightly = []
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "group-by/nightly"]
[dev-dependencies]
csv = "1.0"
elapsed = "0.1"
env_logger = "0.6"
jemallocator = "0.1"
quickcheck = "0.8"
rand = "0.6"
rand_xorshift = "0.1"
structopt = "0.2"
tempfile = "3.0"
termcolor = "1.0"
warp = "0.1"
[dev-dependencies.chashmap]
git = "https://gitlab.redox-os.org/redox-os/tfs.git"
rev = "b3e7cae1"
[profile.release]
debug = true


@ -1,47 +1,60 @@
# MeiliDB
[![Build Status](https://travis-ci.org/Kerollmops/MeiliDB.svg?branch=master)](https://travis-ci.org/Kerollmops/MeiliDB)
[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
https://www.rust-lang.org)
A _full-text search database_ using a key-value store internally.
It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `misc/` folder.
In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
## Performances
_these informations have been made with a version dated of october 2018, we must update them_
With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
- near 190 users with an average response time of 90ms
- 150 users with an average response time of 70ms
- 100 users with an average response time of 45ms
Network is mesured, servers are located in amsterdam and tests are made between two different datacenters.
```
Running 10s test @ http://localhost:2230
2 threads and 12 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 18.86ms 49.39ms 614.89ms 95.23%
Req/Sec 620.41 59.53 790.00 65.00%
12359 requests in 10.00s, 3.26MB read
Requests/sec: 1235.54
Transfer/sec: 334.22KB
```
### Notes
The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
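The examples in this repository already do this; a minimal sketch, assuming the `jemallocator` crate is declared as a dependency:
```rust
// Use jemalloc instead of the default system allocator for the whole binary.
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
```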
## Usage and examples
MeiliDB work with an index like most of the search engines.
MeiliDB runs with an index like most search engines.
So to test the library you can create one by indexing a simple csv file.
```bash
cargo run --release --example create-database -- test.mdb misc/kaggle.csv
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
```
Once the command finished indexing the database should have been saved under the `test.mdb` folder.
Now you can easily run the `query-database` example to check what is stored in it.
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
```bash
cargo run --release --example query-database -- test.mdb
cargo run --release --example query-database -- test.mdb -n 10 id title
```


@ -1,91 +1,132 @@
use std::collections::hash_map::DefaultHasher;
use std::path::{Path, PathBuf};
use std::hash::{Hash, Hasher};
use std::error::Error;
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::{HashMap, HashSet};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
use meilidb::database::update::PositiveUpdateBuilder;
use meilidb::database::{Database, Schema, UpdateBuilder};
use meilidb::tokenizer::DefaultBuilder;
use meilidb::database::Database;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document<'a> {
id: &'a str,
title: &'a str,
description: &'a str,
image: &'a str,
}
#[derive(Serialize, Deserialize)]
struct Document<'a> (
#[serde(borrow)]
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
fn create_schema() -> Schema {
let mut schema = SchemaBuilder::new();
schema.new_attribute("id", STORED);
schema.new_attribute("title", STORED | INDEXED);
schema.new_attribute("description", STORED | INDEXED);
schema.new_attribute("image", STORED);
schema.build()
}
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
let database = Database::create(database_path, schema.clone())?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path, &schema)?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? {
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
};
}
let document_id = calculate_hash(&document.id);
update.update(document_id, &document).unwrap();
println!();
println!("building update...");
let update = update.build()?;
println!("ingesting update...");
database.ingest_update_file(update)?;
}
let mut update = update.build()?;
update.set_move(true);
database.ingest_update_file(update)?;
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = create_schema();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path)
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
});
if let Err(e) = result {
@ -93,6 +134,5 @@ fn main() -> Result<(), Box<Error>> {
}
println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(())
}

examples/http-server.rs Normal file (435 lines)

@ -0,0 +1,435 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use log::{error, info};
use std::error::Error;
use std::ffi::OsStr;
use std::fmt;
use std::fs::{self, File};
use std::io::{self, BufRead, BufReader};
use std::net::SocketAddr;
use std::path::{PathBuf, Path};
use std::sync::Arc;
use std::time::SystemTime;
use hashbrown::{HashMap, HashSet};
use chashmap::CHashMap;
use chashmap::ReadGuard;
use elapsed::measure_time;
use meilidb::database::Database;
use meilidb::database::UpdateBuilder;
use meilidb::database::schema::Schema;
use meilidb::database::schema::SchemaBuilder;
use meilidb::tokenizer::DefaultBuilder;
use serde_derive::Deserialize;
use serde_derive::Serialize;
use structopt::StructOpt;
use warp::{Rejection, Filter};
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The address and port to bind the server to.
#[structopt(short = "l", default_value = "127.0.0.1:8080")]
pub listen_addr: SocketAddr,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words: PathBuf,
}
//
// ERRORS FOR THE MULTIDATABASE
//
#[derive(Debug)]
pub enum DatabaseError {
AlreadyExist,
NotExist,
NotFound(String),
Unknown(Box<Error>),
}
impl fmt::Display for DatabaseError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DatabaseError::AlreadyExist => write!(f, "File already exist"),
DatabaseError::NotExist => write!(f, "File not exist"),
DatabaseError::NotFound(ref name) => write!(f, "Database {} not found", name),
DatabaseError::Unknown(e) => write!(f, "{}", e),
}
}
}
impl Error for DatabaseError {}
impl From<Box<Error>> for DatabaseError {
fn from(e: Box<Error>) -> DatabaseError {
DatabaseError::Unknown(e)
}
}
//
// MULTIDATABASE DEFINITION
//
pub struct MultiDatabase {
databases: CHashMap<String, Database>,
db_path: PathBuf,
stop_words: HashSet<String>,
}
impl MultiDatabase {
pub fn new(path: PathBuf, stop_words: HashSet<String>) -> MultiDatabase {
MultiDatabase {
databases: CHashMap::new(),
db_path: path,
stop_words: stop_words
}
}
pub fn create(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
let rdb_name = format!("{}.mdb", name);
let database_path = self.db_path.join(rdb_name);
if database_path.exists() {
return Err(DatabaseError::AlreadyExist.into());
}
let index = Database::create(database_path, &schema)?;
self.databases.insert_new(name, index);
Ok(())
}
pub fn load(&self, name: String) -> Result<(), DatabaseError> {
let rdb_name = format!("{}.mdb", name);
let index_path = self.db_path.join(rdb_name);
if !index_path.exists() {
return Err(DatabaseError::NotExist.into());
}
let index = Database::open(index_path)?;
self.databases.insert_new(name, index);
Ok(())
}
pub fn load_existing(&self) {
let paths = match fs::read_dir(self.db_path.clone()){
Ok(p) => p,
Err(e) => {
error!("{}", e);
return
}
};
for path in paths {
let path = match path {
Ok(p) => p.path(),
Err(_) => continue
};
let path_str = match path.to_str() {
Some(p) => p,
None => continue
};
let extension = match get_extension_from_path(path_str) {
Some(e) => e,
None => continue
};
if extension != "mdb" {
continue
}
let name = match get_file_name_from_path(path_str) {
Some(f) => f,
None => continue
};
let db = match Database::open(path.clone()) {
Ok(db) => db,
Err(_) => continue
};
self.databases.insert_new(name.to_string(), db);
info!("Load database {}", name);
}
}
pub fn create_or_load(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
match self.create(name.clone(), schema) {
Err(DatabaseError::AlreadyExist) => self.load(name),
x => x,
}
}
pub fn get(&self, name: String) -> Result<ReadGuard<String, Database>, Box<Error>> {
Ok(self.databases.get(&name).ok_or(DatabaseError::NotFound(name))?)
}
}
fn get_extension_from_path(path: &str) -> Option<&str> {
Path::new(path).extension().and_then(OsStr::to_str)
}
fn get_file_name_from_path(path: &str) -> Option<&str> {
Path::new(path).file_stem().and_then(OsStr::to_str)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
//
// PARAMS & BODY FOR HTTPS HANDLERS
//
#[derive(Deserialize)]
struct CreateBody {
name: String,
schema: SchemaBuilder,
}
#[derive(Deserialize)]
struct IngestBody {
insert: Option<Vec<HashMap<String, String>>>,
delete: Option<Vec<HashMap<String, String>>>
}
#[derive(Serialize)]
struct IngestResponse {
inserted: usize,
deleted: usize
}
#[derive(Deserialize)]
struct SearchQuery {
q: String,
limit: Option<usize>,
}
//
// HTTP ROUTES
//
// Create a new index.
// The index name should be unused and the schema valid.
//
// POST /create
// Body:
// - name: String
// - schema: JSON
// - stopwords: Vec<String>
fn create(body: CreateBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let schema = body.schema.build();
match db.create(body.name.clone(), schema) {
Ok(_) => Ok(format!("{} created ", body.name)),
Err(e) => {
error!("{:?}", e);
return Err(warp::reject::not_found())
}
}
}
// Ingest new document.
// It's possible to have positive or/and negative updates.
//
// PUT /:name/ingest
// Body:
// - insert: Option<Vec<JSON>>
// - delete: Option<Vec<String>>
fn ingest(index_name: String, body: IngestBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let schema = {
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
let view = index.view();
view.schema().clone()
};
let tokenizer_builder = DefaultBuilder::new();
let now = match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
Ok(n) => n.as_secs(),
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
};
let sst_name = format!("update-{}-{}.sst", index_name, now);
let sst_path = db.db_path.join(sst_name);
let mut response = IngestResponse{inserted: 0, deleted: 0};
let mut update = UpdateBuilder::new(sst_path, schema);
if let Some(documents) = body.delete {
for doc in documents {
if let Err(e) = update.remove_document(doc) {
error!("Impossible to remove document; {:?}", e);
} else {
response.deleted += 1;
}
}
}
let stop_words = &db.stop_words;
if let Some(documents) = body.insert {
for doc in documents {
if let Err(e) = update.update_document(doc, &tokenizer_builder, &stop_words) {
error!("Impossible to update document; {:?}", e);
} else {
response.inserted += 1;
}
}
}
let update = match update.build() {
Ok(u) => u,
Err(e) => {
error!("Impossible to create an update file; {:?}", e);
return Err(warp::reject::not_found())
}
};
{
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
if let Err(e) = index.ingest_update_file(update) {
error!("Impossible to ingest sst file; {:?}", e);
return Err(warp::reject::not_found())
};
}
if let Ok(response) = serde_json::to_string(&response) {
return Ok(response);
};
return Err(warp::reject::not_found())
}
// Search in a specific index
// The default limit is 20
//
// GET /:name/search
// Params:
// - query: String
// - limit: Option<usize>
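// Example request (hypothetical index name), as mounted under `/index` by `start_server` below:
//   GET /index/movies/search?q=batman&limit=5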
fn search(index_name: String, query: SearchQuery, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let view = {
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
index.view()
};
let limit = query.limit.unwrap_or(20);
let query_builder = match view.query_builder() {
Ok(q) => q,
Err(_err) => return Err(warp::reject::not_found()),
};
let (time, responses) = measure_time(|| {
let docs = query_builder.query(&query.q, 0..limit);
let mut results: Vec<HashMap<String, String>> = Vec::with_capacity(limit);
for doc in docs {
match view.document_by_id(doc.id) {
Ok(val) => results.push(val),
Err(e) => println!("{:?}", e),
}
}
results
});
let response = match serde_json::to_string(&responses) {
Ok(val) => val,
Err(err) => format!("{:?}", err),
};
info!("index: {} - search: {:?} - limit: {} - time: {}", index_name, query.q, limit, time);
Ok(response)
}
fn start_server(listen_addr: SocketAddr, db: Arc<MultiDatabase>) {
let index_path = warp::path("index").and(warp::path::param::<String>());
let db = warp::any().map(move || db.clone());
let create_path = warp::path("create").and(warp::path::end());
let ingest_path = index_path.and(warp::path("ingest")).and(warp::path::end());
let search_path = index_path.and(warp::path("search")).and(warp::path::end());
let create = warp::post2()
.and(create_path)
.and(warp::body::json())
.and(db.clone())
.and_then(create);
let ingest = warp::put2()
.and(ingest_path)
.and(warp::body::json())
.and(db.clone())
.and_then(ingest);
let search = warp::get2()
.and(search_path)
.and(warp::query())
.and(db.clone())
.and_then(search);
let api = create
.or(ingest)
.or(search);
let logs = warp::log("server");
let headers = warp::reply::with::header("Content-Type", "application/json");
let routes = api.with(logs).with(headers);
info!("Server is started on {}", listen_addr);
warp::serve(routes).run(listen_addr);
}
fn main() {
env_logger::init();
let opt = Opt::from_args();
let stop_words = match retrieve_stop_words(&opt.stop_words) {
Ok(s) => s,
Err(_) => HashSet::new(),
};
let db = Arc::new(MultiDatabase::new(opt.database_path.clone(), stop_words));
db.load_existing();
start_server(opt.listen_addr, db);
}


@ -1,11 +1,19 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::path::PathBuf;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)]
pub struct Opt {
@ -13,20 +21,87 @@ pub struct Opt {
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document {
id: String,
title: String,
description: String,
image: String,
type Document = HashMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
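// Convert a character-based (index, length) span into the corresponding (byte index, byte length)
// span of `text`, so that highlight boundaries always fall on valid UTF-8 boundaries.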
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
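// Collect, for the given attribute, the byte offsets delimiting matched words; the sorted
// boundary list returned here is what `display_highlights` walks two entries at a time.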
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for match_ in matches {
let match_attribute = match_.attribute.attribute();
if SchemaAttr::new(match_attribute) == attribute {
let word_area = match_.word_area;
let char_index = word_area.char_index() as usize;
let char_length = word_area.length() as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
@ -41,26 +116,53 @@ fn main() -> Result<(), Box<Error>> {
io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view();
let schema = view.schema();
let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap();
builder.query(&buffer, 0..opt.number_results)
builder.query(query, 0..opt.number_results)
});
let mut full_documents = Vec::with_capacity(documents.len());
let number_of_documents = documents.len();
for doc in documents {
match view.document_by_id::<Document>(doc.id) {
Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
for document in documents {
match view.retrieve_document::<Document>(document.id) {
Ok(document) => full_documents.push(document),
print!("{}: ", name);
let areas = create_highlight_areas(&text, &doc.matches, attr);
display_highlights(&text, &areas)?;
println!();
}
},
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute.attribute());
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
println!("{:#?}", full_documents);
println!("Found {} results in {}", full_documents.len(), elapsed);
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
buffer.clear();
}


@ -0,0 +1,19 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
[attributes.description]
stored = true
indexed = true
[attributes.image]
stored = true
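This schema file is the one passed to the `create-database` example through its `--schema` flag; a minimal loading sketch, reusing the `Schema::from_toml` call shown in `examples/create-database.rs` above:
```rust
use std::error::Error;
use std::fs::File;

use meilidb::database::Schema;

fn main() -> Result<(), Box<Error>> {
    // Parse the TOML schema exactly as examples/create-database.rs does.
    let file = File::open("schema-example.toml")?;
    let _schema = Schema::from_toml(file)?;
    Ok(())
}
```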


@ -95,7 +95,8 @@ or
other
ought
our
ours ourselves
ours
ourselves
out
over
own

misc/fr.stopwords.txt Normal file (163 lines)

@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi

src/attribute.rs Normal file (105 lines)

@ -0,0 +1,105 @@
use std::fmt;
/// Represent an attribute number along with the word index
/// according to the tokenizer used.
///
/// It can accept up to 1024 attributes and word positions
/// can be maximum 2^22.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);
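// Packed layout (see `new` below): the attribute number lives in the 10 high bits and the
// word index in the 22 low bits, e.g. `Attribute::new(3, 11)` stores `(3 << 22) | 11`,
// so `attribute()` returns 3 and `word_index()` returns 11.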
impl Attribute {
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
if attribute & 0b1111_1100_0000_0000 != 0 {
return Err(AttributeError::AttributeTooBig)
}
if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(AttributeError::IndexTooBig)
}
let attribute = u32::from(attribute) << 22;
Ok(Attribute(attribute | index))
}
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
///
/// # Panics
///
/// The attribute must not be greater than 1024
/// and the word index not greater than 2^22.
pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
match Attribute::new(attribute, index) {
Ok(attribute) => attribute,
Err(AttributeError::AttributeTooBig) => {
panic!("attribute must not be greater than 1024")
},
Err(AttributeError::IndexTooBig) => {
panic!("attribute word index must not be greater than 2^22")
},
}
}
pub(crate) fn max_value() -> Attribute {
Attribute(u32::max_value())
}
#[inline]
pub fn attribute(self) -> u16 {
(self.0 >> 22) as u16
}
#[inline]
pub fn word_index(self) -> u32 {
self.0 & 0b0000_0000_0011_1111_1111_1111_1111
}
}
impl fmt::Debug for Attribute {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Attribute")
.field("attribute", &self.attribute())
.field("word_index", &self.word_index())
.finish()
}
}
pub enum AttributeError {
AttributeTooBig,
IndexTooBig,
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
quickcheck! {
fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) {
return TestResult::discard()
}
let attribute = Attribute::new_faillible(gen_attr, gen_index);
let valid_attribute = attribute.attribute() == gen_attr;
let valid_index = attribute.word_index() == gen_index;
TestResult::from_bool(valid_attribute && valid_index)
}
fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
return TestResult::discard()
}
let a = Attribute::new_faillible(gen_attr, gen_index);
let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
TestResult::from_bool(a < b)
}
}
}


@ -50,6 +50,7 @@ impl AutomatonExt for DfaExt {
}
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,


@ -1,59 +1,54 @@
use std::io::{self, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::error::Error;
use std::path::Path;
use std::sync::Arc;
use std::{io, mem};
use std::mem::size_of;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use fst::raw::MmapReadOnly;
use serde::ser::{Serialize, Serializer};
use crate::DocumentId;
use crate::data::Data;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds {
data: Data,
}
pub struct DocIds(SharedData);
impl DocIds {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
let data = Data::Mmap(mmap);
Ok(DocIds { data })
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
// FIXME check if modulo DocumentId
let len = vec.len();
let data = Data::Shared {
bytes: Arc::new(vec),
offset: 0,
len: len
};
Ok(DocIds { data })
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let doc_ids = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIds(doc_ids))
}
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
}
pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok()
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn doc_ids(&self) -> &Set<DocumentId> {
let slice = &self.data;
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / mem::size_of::<DocumentId>();
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}
impl Serialize for DocIds {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.data.as_ref().serialize(serializer)
}
}


@ -1,16 +1,15 @@
use std::io::{self, Write, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::mem::size_of;
use std::ops::Index;
use std::path::Path;
use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly;
use sdset::Set;
use crate::DocIndex;
use crate::data::Data;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
@ -21,52 +20,45 @@ struct Range {
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: Data,
indexes: Data,
ranges: SharedData,
indexes: SharedData,
}
impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
DocIndexes::from_data(Data::Mmap(mmap))
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
let bytes = Arc::new(bytes);
let len = bytes.len();
let data = SharedData::new(bytes, 0, len);
let mut cursor = Cursor::new(data);
DocIndexes::from_cursor(&mut cursor)
}
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let len = vec.len();
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let ranges = cursor.get_ref().range(offset, len);
cursor.consume(len);
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
let data = Data::Shared { bytes, offset, len };
DocIndexes::from_data(data)
}
fn from_data(data: Data) -> io::Result<Self> {
let ranges_len_offset = data.len() - size_of::<u64>();
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = data.range(ranges_offset, ranges_len);
let indexes = data.range(0, ranges_offset);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let indexes = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIndexes { ranges, indexes })
}
pub fn to_vec(&self) -> Vec<u8> {
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
let mut bytes = Vec::with_capacity(capacity);
bytes.extend_from_slice(&self.indexes);
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
bytes
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index as usize).map(|Range { start, end }| {
self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder::new(Vec::new())
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
@ -135,40 +131,52 @@ impl<W: Write> DocIndexesBuilder<W> {
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
@ -183,19 +191,33 @@ mod tests {
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let bytes = docs.to_vec();
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes);


@ -1,51 +1,43 @@
mod doc_ids;
mod doc_indexes;
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Deref;
use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
#[derive(Clone)]
enum Data {
Shared {
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
},
Mmap(MmapReadOnly),
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl Data {
pub fn range(&self, off: usize, l: usize) -> Data {
match self {
Data::Shared { bytes, offset, len } => {
assert!(off + l <= *len);
Data::Shared {
bytes: bytes.clone(),
offset: offset + off,
len: l,
}
},
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
impl SharedData {
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::new(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl Default for Data {
fn default() -> Data {
Data::Shared {
bytes: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data {
impl Deref for SharedData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
@ -53,13 +45,14 @@ impl Deref for Data {
}
}
impl AsRef<[u8]> for Data {
impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] {
match self {
Data::Shared { bytes, offset, len } => {
&bytes[*offset..offset + len]
},
Data::Mmap(m) => m.as_slice(),
}
&self.bytes[self.offset..self.offset + self.len]
}
}
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}


@ -1,110 +0,0 @@
mod ops;
pub mod positive;
pub mod negative;
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
pub use self::negative::NegativeBlob;
pub use self::ops::OpBuilder;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Debug)]
pub enum Blob {
Positive(PositiveBlob),
Negative(NegativeBlob),
}
impl Blob {
pub fn is_negative(&self) -> bool {
self.sign() == Sign::Negative
}
pub fn is_positive(&self) -> bool {
self.sign() == Sign::Positive
}
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
}
impl Serialize for Blob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
match self {
Blob::Positive(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Positive)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
Blob::Negative(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Negative)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
}
}
}
impl<'de> Deserialize<'de> for Blob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = Blob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a Blob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let sign = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(0, &self)),
};
match sign {
Sign::Positive => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Positive(blob))
},
Sign::Negative => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Negative(blob))
},
}
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Sign {
Positive,
Negative,
}
impl Sign {
pub fn invert(self) -> Sign {
match self {
Sign::Positive => Sign::Negative,
Sign::Negative => Sign::Positive,
}
}
}


@ -1,67 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::fmt;
use sdset::Set;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct NegativeBlob {
doc_ids: DocIds,
}
impl NegativeBlob {
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
{
let doc_ids = DocIds::from_path(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
let doc_ids = DocIds::from_bytes(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_raw(doc_ids: DocIds) -> Self {
NegativeBlob { doc_ids }
}
pub fn as_ids(&self) -> &DocIds {
&self.doc_ids
}
pub fn into_doc_ids(self) -> DocIds {
self.doc_ids
}
}
impl AsRef<Set<DocumentId>> for NegativeBlob {
fn as_ref(&self) -> &Set<DocumentId> {
self.as_ids().doc_ids()
}
}
impl fmt::Debug for NegativeBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "NegativeBlob(")?;
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
write!(f, ")")
}
}
impl Serialize for NegativeBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.doc_ids.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for NegativeBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
let bytes = Vec::deserialize(deserializer)?;
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::NegativeBlob;
pub use self::ops::OpBuilder;


@ -1,73 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::Set;
use crate::database::blob::NegativeBlob;
use crate::data::DocIds;
use crate::DocumentId;
pub struct OpBuilder<'a> {
inner: SdOpBuilder<'a, DocumentId>,
}
/// Do a set operation on multiple negative blobs.
impl<'a> OpBuilder<'a> {
pub fn new() -> Self {
Self { inner: SdOpBuilder::new() }
}
pub fn with_capacity(cap: usize) -> Self {
Self { inner: SdOpBuilder::with_capacity(cap) }
}
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'a NegativeBlob) {
let set = Set::new_unchecked(blob.as_ref());
self.inner.push(set);
}
pub fn union(self) -> Union<'a> {
Union::new(self.inner.union())
}
pub fn intersection(self) -> Intersection<'a> {
Intersection::new(self.inner.intersection())
}
pub fn difference(self) -> Difference<'a> {
Difference::new(self.inner.difference())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
SymmetricDifference::new(self.inner.symmetric_difference())
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'a> {
op: sdset::multi::$name<'a, DocumentId>,
}
impl<'a> $name<'a> {
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
$name { op }
}
pub fn into_negative_blob(self) -> NegativeBlob {
let document_ids = sdset::SetOperation::into_set_buf(self.op);
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
NegativeBlob::from_raw(doc_ids)
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
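A hypothetical sketch (not part of the diff): merging two deletion sets with the builder above; the result is again a NegativeBlob.

// Illustrative only; module paths assumed from this changeset.
use crate::database::blob::NegativeBlob;
use crate::database::blob::negative::OpBuilder;

fn merge_deletions(a: &NegativeBlob, b: &NegativeBlob) -> NegativeBlob {
    OpBuilder::new().add(a).add(b).union().into_negative_blob()
}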


@ -1,109 +0,0 @@
use std::error::Error;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use group_by::GroupBy;
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::database::blob::{positive, negative};
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
a.sign() == b.sign()
}
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
match blob {
Blob::Positive(blob) => blob,
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
}
}
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
}
}
pub struct OpBuilder {
blobs: Vec<Blob>,
}
impl OpBuilder {
pub fn new() -> OpBuilder {
OpBuilder { blobs: Vec::new() }
}
pub fn with_capacity(cap: usize) -> OpBuilder {
OpBuilder { blobs: Vec::with_capacity(cap) }
}
pub fn push(&mut self, blob: Blob) {
if self.blobs.is_empty() && blob.is_negative() { return }
self.blobs.push(blob);
}
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
let groups = GroupBy::new(&self.blobs, blob_same_sign);
let mut aggregated = Vec::new();
for blobs in groups {
match blobs[0].sign() {
Sign::Positive => {
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_positive(blob));
}
let mut stream = op_builder.union().into_stream();
let mut builder = PositiveBlobBuilder::memory();
while let Some((input, doc_indexes)) = stream.next() {
// FIXME empty doc_indexes must be handled by OpBuilder
if !doc_indexes.is_empty() {
builder.insert(input, doc_indexes).unwrap();
}
}
let (map, doc_indexes) = builder.into_inner().unwrap();
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
aggregated.push(Blob::Positive(blob));
},
Sign::Negative => {
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_negative(blob));
}
let blob = op_builder.union().into_negative_blob();
aggregated.push(Blob::Negative(blob));
},
}
}
let mut buffer = Vec::new();
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
let negative = NegativeBlob::default();
let (positive, negative) = match slice {
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
[a] => (unwrap_positive(a), &negative),
_ => unreachable!(),
};
let mut builder = PositiveBlobBuilder::memory();
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
let mut stream = op_builder.union().into_stream();
while let Some((input, doc_indexes)) = stream.next() {
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
builder.insert(input, Set::new_unchecked(&buffer))?;
}
}
let (map, doc_indexes) = builder.into_inner()?;
PositiveBlob::from_bytes(map, doc_indexes)
})
}
}
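A hypothetical usage sketch (not part of the diff): the merge above groups consecutive blobs by sign, unions each group, then applies each negative group as a difference against everything accumulated before it.

// Illustrative only; module paths assumed from this changeset.
use std::error::Error;
use crate::database::blob::{Blob, OpBuilder, PositiveBlob};

fn fold_updates(blobs: Vec<Blob>) -> Result<PositiveBlob, Box<Error>> {
    let mut builder = OpBuilder::with_capacity(blobs.len());
    for blob in blobs {
        // per push() above, a leading negative blob has nothing to subtract
        // from and is silently dropped
        builder.push(blob);
    }
    builder.merge()
}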


@ -1,254 +0,0 @@
use std::fmt;
use std::io::Write;
use std::path::Path;
use std::error::Error;
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::Set;
use crate::DocIndex;
use crate::data::{DocIndexes, DocIndexesBuilder};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Default)]
pub struct PositiveBlob {
map: Map,
indexes: DocIndexes,
}
impl PositiveBlob {
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
Q: AsRef<Path>,
{
let map = Map::from_path(map)?;
let indexes = DocIndexes::from_path(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
PositiveBlob { map, indexes }
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).map(|index| &self.indexes[index as usize])
}
pub fn as_map(&self) -> &Map {
&self.map
}
pub fn as_indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn explode(self) -> (Map, DocIndexes) {
(self.map, self.indexes)
}
}
impl fmt::Debug for PositiveBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "PositiveBlob([")?;
let mut stream = self.into_stream();
let mut first = true;
while let Some((k, v)) = stream.next() {
if !first {
write!(f, ", ")?;
}
first = false;
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
}
write!(f, "])")
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
type Item = (&'a [u8], &'a [DocIndex]);
/// The type of the stream to be constructed.
type Into = PositiveBlobStream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
PositiveBlobStream {
map_stream: self.map.into_stream(),
doc_indexes: &self.indexes,
}
}
}
pub struct PositiveBlobStream<'m> {
map_stream: map::Stream<'m>,
doc_indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
type Item = (&'a [u8], &'a [DocIndex]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let doc_indexes = &self.doc_indexes[index as usize];
Some((input, doc_indexes))
},
None => None,
}
}
}
impl Serialize for PositiveBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&self.map.as_fst().to_vec())?;
tuple.serialize_element(&self.indexes.to_vec())?;
tuple.end()
}
}
impl<'de> Deserialize<'de> for PositiveBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = PositiveBlob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a PositiveBlob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let map = match seq.next_element()? {
Some(bytes) => match Map::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(0, &self)),
};
let indexes = match seq.next_element()? {
Some(bytes) => match DocIndexes::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(PositiveBlob { map, indexes })
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
pub struct PositiveBlobBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBlobBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
Ok(PositiveBlobBuilder {
map: fst::MapBuilder::new(map)?,
indexes: DocIndexesBuilder::new(indexes),
value: 0,
})
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(doc_indexes)?;
self.value += 1;
Ok(())
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
let bytes = bincode::serialize(&positive_blob)?;
let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
}
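A hypothetical sketch (not part of the diff), reusing the imports from the file above: the builder forwards keys to an fst::MapBuilder, so keys must be inserted in strictly increasing byte order.

fn ordered_inserts() -> Result<(), Box<Error>> {
    let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
    let mut builder = PositiveBlobBuilder::memory();
    builder.insert("aaa", Set::new(&[a])?)?;                   // ok
    builder.insert("aab", Set::new(&[a])?)?;                   // ok, "aab" > "aaa"
    assert!(builder.insert("aaa", Set::new(&[a])?).is_err());  // out of order
    Ok(())
}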


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
pub use self::ops::OpBuilder;


@ -1,128 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;
pub struct OpBuilder<'m> {
// the operation on the maps is always a union.
map_op: fst::map::OpBuilder<'m>,
indexes: Vec<&'m DocIndexes>,
}
/// Do a set operation on multiple positive blobs.
impl<'m> OpBuilder<'m> {
pub fn new() -> Self {
Self {
map_op: fst::map::OpBuilder::new(),
indexes: Vec::new(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
indexes: Vec::with_capacity(cap),
}
}
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'m PositiveBlob) {
self.map_op.push(blob.as_map());
self.indexes.push(blob.as_indexes());
}
pub fn union(self) -> Union<'m> {
Union::new(self.map_op.union(), self.indexes)
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.map_op.union(), self.indexes)
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.map_op.union(), self.indexes)
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.map_op.union(), self.indexes)
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'m> {
stream: fst::map::Union<'m>,
indexes: Vec<&'m DocIndexes>,
outs: Vec<DocIndex>,
}
impl<'m> $name<'m> {
fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
$name {
stream: stream,
indexes: indexes,
outs: Vec::new(),
}
}
}
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
// loop {
// let (input, ivalues) = match self.stream.next() {
// Some(value) => value,
// None => return None,
// };
// self.outs.clear();
// let mut builder = SdOpBuilder::with_capacity(ivalues.len());
// for ivalue in ivalues {
// let indexes = self.indexes[ivalue.index];
// let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
// let set = Set::new_unchecked(indexes);
// builder.push(set);
// }
// builder.$operation().extend_vec(&mut self.outs);
// if self.outs.is_empty() { continue }
// return Some((input, &self.outs))
// }
// FIXME make the above code compile
match self.stream.next() {
Some((input, ivalues)) => {
self.outs.clear();
let mut builder = SdOpBuilder::with_capacity(ivalues.len());
for ivalue in ivalues {
let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
let set = Set::new_unchecked(doc_indexes);
builder.push(set);
}
builder.$operation().extend_vec(&mut self.outs);
if self.outs.is_empty() { return None }
return Some((input, Set::new_unchecked(&self.outs)))
},
None => None
}
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
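A hypothetical sketch (not part of the diff): streaming the union of two positive blobs; each yielded item is a word together with its merged set of doc indexes.

// Illustrative only; module paths assumed from this changeset.
use fst::Streamer;
use crate::database::blob::PositiveBlob;
use crate::database::blob::positive::OpBuilder;

fn count_words_in_union(a: &PositiveBlob, b: &PositiveBlob) -> usize {
    let mut stream = OpBuilder::new().add(a).add(b).union();
    let mut count = 0;
    while let Some((_word, _doc_indexes)) = stream.next() {
        count += 1;
    }
    count
}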


@ -2,13 +2,13 @@ use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
@ -19,7 +19,7 @@ impl DocumentKey {
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<NativeEndian>(id).unwrap();
wtr.write_u64::<BigEndian>(id.0).unwrap();
DocumentKey(buffer)
}
@ -43,7 +43,8 @@ impl DocumentKey {
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
}
@ -72,11 +73,19 @@ impl DocumentKeyAttr {
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
wtr.write_u16::<BigEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::min())
}
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::max())
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
@ -88,12 +97,13 @@ impl DocumentKeyAttr {
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
SchemaAttr::new(value)
}
@ -112,7 +122,24 @@ impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().as_u32())
.field("attribute", &self.attribute().0)
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keep_as_ref_order() {
for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
let id = DocumentId(0);
let a = DocumentKeyAttr::new(id, SchemaAttr(a));
let b = DocumentKeyAttr::new(id, SchemaAttr(b));
assert!(a < b);
assert!(a.as_ref() < b.as_ref());
}
}
}
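The keep_as_ref_order test above relies on the raw key bytes sorting the same way as the logical key; a hedged summary of the layout, as read from the code in this file:

// DocumentKey     : "doc-" (4 bytes) | document id, u64 big-endian (8 bytes)
// DocumentKeyAttr : DocumentKey | "-" (1 byte) | attribute, u16 big-endian (2 bytes)
//
// With big-endian integers, lexicographic comparison of the raw bytes matches
// numeric comparison of (document_id, attribute), which is what RocksDB
// iteration order and the with_attribute_min/max range bounds depend on.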

src/database/index/mod.rs (new file, 82 lines)

@ -0,0 +1,82 @@
mod negative;
mod positive;
pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};
use std::error::Error;
use std::io::Cursor;
use std::sync::Arc;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::Map;
use crate::data::{SharedData, DocIndexes};
#[derive(Default)]
pub struct Index {
pub(crate) negative: Negative,
pub(crate) positive: Positive,
}
impl Index {
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
let len = bytes.len();
Index::from_shared_bytes(Arc::new(bytes), 0, len)
}
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<Index, Box<Error>>
{
let data = SharedData::new(bytes, offset, len);
let mut cursor = Cursor::new(data);
let negative = Negative::from_cursor(&mut cursor)?;
let positive = Positive::from_cursor(&mut cursor)?;
Ok(Index { negative, positive })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
self.negative.write_to_bytes(bytes);
self.positive.write_to_bytes(bytes);
}
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
if other.negative.is_empty() {
let negative = Negative::default();
let positive = self.positive.union(&other.positive)?;
return Ok(Index { negative, positive })
}
let mut buffer = Vec::new();
let mut builder = PositiveBuilder::memory();
let mut stream = self.positive.into_stream();
while let Some((key, indexes)) = stream.next() {
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let positive = {
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let negative = Negative::default();
let positive = positive.union(&other.positive)?;
Ok(Index { negative, positive })
}
}
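In plain terms, the incoming negative set acts purely as deletions against what is already indexed; a hedged summary of the merge semantics above (not part of the diff):

// merged.positive = (self.positive with other.negative's document ids removed)
//                   unioned with other.positive
// merged.negative = empty
//
// Deletions are applied once, at merge time, and never carried forward.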


@ -0,0 +1,43 @@
use std::error::Error;
use std::io::Cursor;
use std::ops::Deref;
use sdset::Set;
use byteorder::{LittleEndian, WriteBytesExt};
use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct Negative(DocIds);
impl Negative {
pub fn new(doc_ids: DocIds) -> Negative {
Negative(doc_ids)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
let doc_ids = DocIds::from_cursor(cursor)?;
Ok(Negative(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.0.as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Deref for Negative {
type Target = Set<DocumentId>;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
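For reference, a hedged sketch of the framing this writer appears to produce (not part of the diff):

// [ len: u64 little-endian ][ DocIds payload, `len` bytes ]
//
// from_cursor reads the same framing back, so the Negative and Positive parts
// can be laid out back-to-back in one buffer and re-read by
// Index::from_shared_bytes.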


@ -0,0 +1,166 @@
use std::io::{Write, BufRead, Cursor};
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::{Set, SetOperation};
use sdset::duo::Union;
use fst::raw::Fst;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;
#[derive(Default)]
pub struct Positive {
map: Map,
indexes: DocIndexes,
}
impl Positive {
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
Positive { map, indexes }
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let data = cursor.get_ref().range(offset, len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
cursor.consume(len);
let indexes = DocIndexes::from_cursor(cursor)?;
Ok(Positive { map, indexes})
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
pub fn map(&self) -> &Map {
&self.map
}
pub fn indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
let mut builder = PositiveBuilder::memory();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[a] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(Positive { map, indexes })
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
type Item = (&'a [u8], &'a Set<DocIndex>);
/// The type of the stream to be constructed.
type Into = Stream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct PositiveBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBuilder<W, X> {
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
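A hypothetical sketch (not part of the diff): building a Positive part in memory and turning the raw buffers back into searchable structures, the same way union() does above. Positive and PositiveBuilder are the types defined in this file; the other paths are assumed from this changeset.

use std::error::Error;
use fst::Map;
use sdset::Set;
use crate::data::DocIndexes;
use crate::DocIndex;

fn build_positive(entries: &[(&str, &Set<DocIndex>)]) -> Result<Positive, Box<Error>> {
    let mut builder = PositiveBuilder::memory();
    for &(word, indexes) in entries {
        builder.insert(word, indexes)?; // words must arrive in increasing byte order
    }
    let (map, indexes) = builder.into_inner()?;
    Ok(Positive::new(Map::from_bytes(map)?, DocIndexes::from_bytes(indexes)?))
}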


@ -1,45 +1,81 @@
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
use std::sync::{Arc, Mutex};
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::path::Path;
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands};
use crossbeam::atomic::ArcCell;
use log::info;
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::database_view::{DatabaseView, DocumentIter};
use self::blob::positive::PositiveBlob;
use self::update::Update;
use self::schema::Schema;
use self::blob::Blob;
pub mod blob;
pub mod schema;
pub mod update;
mod document_key;
mod database_view;
mod deserializer;
pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::{Update, UpdateBuilder};
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
pub mod schema;
pub(crate) mod index;
mod deserializer;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from(&*vector)?),
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_INDEX)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(PositiveBlob::default()),
let (elapsed, vector) = elapsed::measure_time(|| snapshot.get(DATA_INDEX));
info!("loading index from kv-store took {}", elapsed);
let index = match vector? {
Some(vector) => {
let bytes = vector.as_ref().to_vec();
info!("index size if {} MiB", bytes.len() / 1024 / 1024);
let (elapsed, index) = elapsed::measure_time(|| Index::from_bytes(bytes));
info!("loading index from bytes took {}", elapsed);
index?
},
None => Index::default(),
};
Ok(index)
}
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
let mut index: Option<Index> = None;
for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
let merged = match index {
Some(ref index) => index.merge(&operand).unwrap(),
None => operand,
};
index.replace(merged);
}
let index = index.unwrap_or_default();
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
bytes
}
pub struct Database {
@ -49,11 +85,11 @@ pub struct Database {
db: Mutex<Arc<DB>>,
// This view is updated each time the DB ingests an update
view: RwLock<DatabaseView<Arc<DB>>>,
view: ArcCell<DatabaseView<Arc<DB>>>,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> {
pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
@ -71,12 +107,12 @@ impl Database {
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to(&mut schema_bytes)?;
schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
@ -94,18 +130,18 @@ impl Database {
// FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from(&*value)?,
Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> {
pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let snapshot = {
// We must have a mutex here to ensure that update ingestions and compactions
// are done atomically and in the right order.
@ -116,32 +152,24 @@ impl Database {
Err(e) => return Err(e.to_string().into()),
};
let move_update = update.can_be_moved();
let path = update.into_path_buf();
let path = path.to_string_lossy();
let path = update.path().to_string_lossy();
let options = IngestExternalFileOptions::new();
// options.move_files(move_update);
let mut options = IngestExternalFileOptions::new();
options.move_files(move_update);
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
// Compacting to trigger the merge operator only one time
// while ingesting the update and not each time searching
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
let (elapsed, result) = elapsed::measure_time(|| {
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])
});
let _ = result?;
info!("ingesting update file took {}", elapsed);
Snapshot::new(db.clone())
};
// Here we will block the view creation for the minimum amount of time:
// updating the DatabaseView itself with the new database snapshot
let view = DatabaseView::new(snapshot)?;
match self.view.write() {
Ok(mut lock) => *lock = view,
Err(e) => return Err(e.to_string().into()),
}
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.set(view.clone());
Ok(())
Ok(view)
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
@ -155,105 +183,508 @@ impl Database {
}
}
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> {
self.view.read().unwrap()
pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.get()
}
}
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
if key != DATA_INDEX {
panic!("The merge operator only supports \"data-index\" merging")
}
let capacity = {
let remaining = operands.size_hint().0;
let already_exist = usize::from(existing_value.is_some());
remaining + already_exist
};
let mut op = blob::OpBuilder::with_capacity(capacity);
if let Some(existing_value) = existing_value {
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
op.push(Blob::Positive(blob));
}
for bytes in operands {
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
op.push(blob);
}
let blob = op.merge().expect("BUG: could not merge blobs");
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::HashSet;
use tempfile::tempdir;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::PositiveUpdateBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::database::update::UpdateBuilder;
use crate::tokenizer::DefaultBuilder;
#[test]
fn ingest_update_file() -> Result<(), Box<Error>> {
fn ingest_one_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, schema.clone())?;
let tokenizer_builder = DefaultBuilder::new();
let database = Database::create(&rocksdb_path, &schema)?;
let update_path = dir.path().join("update.sst");
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let mut update = {
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
let docid0;
let docid1;
let update = {
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);
builder.update(0, &doc0).unwrap();
builder.update(1, &doc1).unwrap();
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
update.set_move(true);
database.ingest_update_file(update)?;
let view = database.view();
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let docid0;
let docid1;
let update1 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-000.sst");
let mut builder = UpdateBuilder::new(update_path, schema.clone());
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
let docid2;
let docid3;
let update2 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-001.sst");
let mut builder = UpdateBuilder::new(update_path, schema);
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
builder.build()?
};
database.ingest_update_file(update1)?;
database.ingest_update_file(update2)?;
let view = database.view();
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use hashbrown::HashSet;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::UpdateBuilder;
use crate::database::schema::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
}
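A hypothetical usage sketch (not part of the diff) of the lock-free view swap introduced above, assuming `database` and `update` are built as in the tests in this file:

// Readers take a cheap Arc clone of the current snapshot; ingestion only swaps
// the ArcCell pointer, so searches in flight are never blocked and keep
// reading the snapshot they started with.
let view = database.view();                            // Arc<DatabaseView<Arc<DB>>>
let documents = view.query_builder()?.query("second title", 0..20);

// elsewhere, possibly on another thread:
let new_view = database.ingest_update_file(update)?;   // publishes a fresh view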


@ -1,29 +1,36 @@
use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write};
use std::{fmt, u32};
use std::path::Path;
use std::error::Error;
use std::{fmt, u16};
use std::ops::BitOr;
use std::sync::Arc;
use std::fs::File;
use serde_derive::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
use serde::Serialize;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
stored: bool,
#[serde(default)]
indexed: bool,
}
impl SchemaProps {
pub fn is_stored(&self) -> bool {
pub fn is_stored(self) -> bool {
self.stored
}
pub fn is_indexed(&self) -> bool {
pub fn is_indexed(self) -> bool {
self.indexed
}
}
@ -39,33 +46,39 @@ impl BitOr for SchemaProps {
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
attrs: LinkedHashMap<String, SchemaProps>,
identifier: String,
attributes: LinkedHashMap<String, SchemaProps>,
}
impl SchemaBuilder {
pub fn new() -> SchemaBuilder {
SchemaBuilder { attrs: LinkedHashMap::new() }
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
SchemaBuilder {
identifier: name.into(),
attributes: LinkedHashMap::new(),
}
}
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attrs.len();
if self.attrs.insert(name.into(), props).is_some() {
let len = self.attributes.len();
if self.attributes.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u32)
SchemaAttr(len as u16)
}
pub fn build(self) -> Schema {
let mut attrs = HashMap::new();
let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32));
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop));
}
Schema { inner: Arc::new(InnerSchema { attrs, props }) }
let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
}
}
@ -76,69 +89,124 @@ pub struct Schema {
#[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>,
}
impl Schema {
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
let file = File::open(path)?;
Schema::read_from(file)
}
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
let attrs = bincode::deserialize_from(reader)?;
let builder = SchemaBuilder { attrs };
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = toml::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = serde_json::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
Ok(builder.build())
}
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
bincode::serialize_into(writer, &builder)
}
fn attributes_ordered(&self) -> LinkedHashMap<String, SchemaProps> {
let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs {
let index = field.as_u32();
let (_, props) = self.inner.props[index as usize];
ordered.insert(index, (name, props));
for (name, attr) in &self.inner.attrs {
let (_, props) = self.inner.props[attr.0 as usize];
ordered.insert(attr.0, (name, props));
}
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
let mut attributes = LinkedHashMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered {
attrs.insert(name, props);
attributes.insert(name.clone(), props);
}
bincode::serialize_into(writer, &attrs)
attributes
}
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let index = attr.as_u32();
let (_, props) = self.inner.props[index as usize];
let (_, props) = self.inner.props[attr.0 as usize];
props
}
pub fn identifier_name(&self) -> &str {
&self.inner.identifier
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned()
}
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let index = attr.as_u32();
let (name, _) = &self.inner.props[index as usize];
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
}
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32);
pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr {
pub fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u32::MAX)
pub fn min() -> SchemaAttr {
SchemaAttr(0)
}
pub fn as_u32(&self) -> u32 {
self.0
pub fn next(self) -> Option<SchemaAttr> {
self.0.checked_add(1).map(SchemaAttr)
}
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u16::MAX)
}
}
@ -151,22 +219,92 @@ impl fmt::Display for SchemaAttr {
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::new();
builder.new_attribute("alphabet", STORED);
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.write_to(&mut buffer)?;
let schema2 = Schema::read_from(buffer.as_slice())?;
schema.write_to_bin(&mut buffer)?;
let schema2 = Schema::read_from_bin(buffer.as_slice())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_toml(&mut buffer)?;
let schema2 = Schema::from_toml(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
identifier = "id"
[attributes."alpha"]
stored = true
[attributes."beta"]
stored = true
indexed = true
[attributes."gamma"]
indexed = true
"#;
let schema2 = Schema::from_toml(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_json(&mut buffer)?;
let schema2 = Schema::from_json(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"stored": true
},
"beta": {
"stored": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = Schema::from_json(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
}
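A hypothetical sketch (not part of the diff), assuming a schema like the ones built in the tests above, where "description" is declared right after "title":

let title = schema.attribute("title").expect("title is declared");
let props = schema.props(title);
assert!(props.is_stored() && props.is_indexed());

// attributes are now dense u16 indexes, so neighbours are a next()/prev() away
if let Some(following) = title.next() {
    println!("declared after title: {}", schema.attribute_name(following));
}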


@ -0,0 +1,243 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,
}
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(FindDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct FindDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
pub struct FindDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
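For context, this is how a DocumentId comes to exist: the value stored under the schema's id attribute is bincode-encoded and hashed. A minimal sketch of that derivation, reusing the calculate_hash helper defined in src/database/serde/mod.rs further down; the "movie-42" value is purely illustrative.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut s = DefaultHasher::new();
    t.hash(&mut s);
    s.finish()
}

fn main() {
    // the document declares an id of "movie-42": bincode-encode it, then hash the bytes,
    // exactly like serialize_entry and serialize_field above
    let id_bytes = bincode::serialize(&"movie-42").unwrap();
    let hash = calculate_hash(&id_bytes); // the serializer wraps this in DocumentId(hash)

    // the same id value always maps to the same DocumentId
    assert_eq!(hash, calculate_hash(&bincode::serialize(&"movie-42").unwrap()));
}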


@ -0,0 +1,196 @@
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
pub struct IndexerSerializer<'a, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
// FIXME should use u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
Err(_) => return Ok(()),
};
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
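The interesting part of serialize_str above is the dual insertion: every non-stop-word token is indexed under its lowercased form and, when they differ, also under its unidecoded lowercased form, so an ASCII query can still match accented text. A reduced, self-contained sketch of that decision; the actual insert_doc_index calls are only hinted at in comments.

fn main() {
    let word = "Été";
    let word_lower = word.to_lowercase();                            // "été"
    let word_unidecoded = unidecode::unidecode(word).to_lowercase(); // "ete"

    if word_lower != word_unidecoded {
        // insert_doc_index(word_unidecoded.into_bytes(), doc_index) would run here
        println!("also indexing the unaccented form: {}", word_unidecoded);
    }
    // insert_doc_index(word_lower.into_bytes(), doc_index) always runs
    println!("indexing: {}", word_lower);
}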


@ -0,0 +1,146 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::SerializerError;
pub struct KeyToStringSerializer;
impl ser::Serializer for KeyToStringSerializer {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}

src/database/serde/mod.rs

@ -0,0 +1,57 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
// stringify! is needed here: "$ty" inside a string literal is not expanded,
// so the error would otherwise always report the literal name "$ty"
Err(SerializerError::UnserializableType { name: stringify!($ty) })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod serializer;
pub mod indexer_serializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
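For reference, with the stringify!($ty) form a single arm of forward_to_unserializable_type! expands to a plain serializer method that simply rejects the primitive by name, for example:

fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
    Err(SerializerError::UnserializableType { name: "bool" })
}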


@ -0,0 +1,286 @@
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, B>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
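The per-field dispatch in MapSerializer and StructSerializer above is the heart of indexing: a field unknown to the schema is silently skipped, a stored field is kept as bincode bytes on the DocumentUpdate, and an indexed field is re-serialized through IndexerSerializer so its text gets tokenized. A reduced sketch of that dispatch using a hypothetical in-memory schema; Props and the field names are illustrative, not the real Schema API.

use std::collections::HashMap;

#[derive(Clone, Copy)]
struct Props { stored: bool, indexed: bool }

fn main() {
    let mut schema = HashMap::new();
    schema.insert("title", Props { stored: true, indexed: true });
    schema.insert("popularity", Props { stored: true, indexed: false });

    for (field, value) in [("title", "Geox SpA"), ("popularity", "42"), ("internal", "x")] {
        match schema.get(field) {
            Some(props) => {
                if props.stored {
                    // kept verbatim on the document update, like insert_attribute_value
                    let _bytes = bincode::serialize(&value).unwrap();
                }
                if props.indexed {
                    // the value would be handed to IndexerSerializer for tokenization here
                }
            }
            None => { /* field not declared in the schema: silently ignored */ }
        }
    }
}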


@ -0,0 +1,64 @@
use std::path::PathBuf;
use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::Schema;
use crate::DocumentId;
use super::{Update, RawUpdateBuilder};
pub struct UpdateBuilder {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl UpdateBuilder {
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
UpdateBuilder {
schema: schema,
raw_builder: RawUpdateBuilder::new(path),
}
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let update = self.raw_builder.document_update(document_id);
let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: update,
stop_words: stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id).remove();
Ok(document_id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
self.raw_builder.build()
}
}
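A hypothetical end-to-end use of UpdateBuilder tying the pieces above together; only update_document, remove_document and build come from the code above, while the Movie type, the DefaultBuilder tokenizer and the update.sst path are assumptions made for illustration.

use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::update::UpdateBuilder;
use crate::database::Schema;
use crate::tokenizer::DefaultBuilder; // assumed: any TokenizerBuilder implementation works

#[derive(Serialize)]
struct Movie { id: u64, title: String, description: String }

fn index_one(schema: Schema) -> Result<(), Box<Error>> {
    let stop_words = HashSet::new();
    let tokenizer_builder = DefaultBuilder::new();
    let mut builder = UpdateBuilder::new("update.sst".into(), schema);

    let movie = Movie {
        id: 1,
        title: "Geox SpA".into(),
        description: "CEO and Executive".into(),
    };
    // finds the document id through the schema, then indexes every stored/indexed field
    let _document_id = builder.update_document(movie, &tokenizer_builder, &stop_words)?;

    // the resulting Update wraps the SST file the database will ingest
    let _update = builder.build()?;
    Ok(())
}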


@ -1,35 +1,17 @@
use std::path::PathBuf;
use std::error::Error;
use std::path::{Path, PathBuf};
mod negative;
mod positive;
mod builder;
mod raw_builder;
pub use self::positive::{PositiveUpdateBuilder, NewState};
pub use self::negative::NegativeUpdateBuilder;
pub use self::builder::UpdateBuilder;
pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
pub struct Update {
path: PathBuf,
can_be_moved: bool,
sst_file: PathBuf,
}
impl Update {
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: false })
}
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: true })
}
pub fn set_move(&mut self, can_be_moved: bool) {
self.can_be_moved = can_be_moved
}
pub fn can_be_moved(&self) -> bool {
self.can_be_moved
}
pub fn into_path_buf(self) -> PathBuf {
self.path
pub fn path(&self) -> &Path {
&self.sst_file
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::NegativeUpdateBuilder;


@ -1,37 +0,0 @@
use std::collections::BTreeSet;
use std::io;
use byteorder::{NativeEndian, WriteBytesExt};
use crate::DocumentId;
pub struct UnorderedNegativeBlobBuilder<W> {
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
wrt: W,
}
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
pub fn memory() -> Self {
UnorderedNegativeBlobBuilder::new(Vec::new())
}
}
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
pub fn new(wrt: W) -> Self {
Self {
doc_ids: BTreeSet::new(),
wrt: wrt,
}
}
pub fn insert(&mut self, doc: DocumentId) -> bool {
self.doc_ids.insert(doc)
}
pub fn into_inner(mut self) -> io::Result<W> {
for id in self.doc_ids {
self.wrt.write_u64::<NativeEndian>(id)?;
}
Ok(self.wrt)
}
}


@ -1,60 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::database::blob::{Blob, NegativeBlob};
use crate::database::update::Update;
use crate::database::DocumentKey;
use crate::database::DATA_INDEX;
use crate::DocumentId;
pub struct NegativeUpdateBuilder {
path: PathBuf,
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
}
impl NegativeUpdateBuilder {
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
NegativeUpdateBuilder {
path: path.into(),
doc_ids: UnorderedNegativeBlobBuilder::memory(),
}
}
pub fn remove(&mut self, id: DocumentId) -> bool {
self.doc_ids.insert(id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let bytes = self.doc_ids.into_inner()?;
let negative_blob = NegativeBlob::from_bytes(bytes)?;
let blob = Blob::Negative(negative_blob);
// write the data-index aka negative blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// FIXME remove this ugly thing !
// let Blob::Negative(negative_blob) = blob;
let negative_blob = match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => unreachable!(),
};
for &document_id in negative_blob.as_ref().as_slice() {
let start = DocumentKey::new(document_id);
let end = start.with_attribute_max();
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::{PositiveUpdateBuilder, NewState};


@ -1,49 +0,0 @@
#![allow(unused)]
use std::collections::BTreeMap;
use std::error::Error;
use std::io::Write;
use sdset::Set;
use crate::database::blob::positive::PositiveBlobBuilder;
use crate::DocIndex;
pub struct UnorderedPositiveBlobBuilder<W, X> {
builder: PositiveBlobBuilder<W, X>,
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
}
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
Self {
builder: PositiveBlobBuilder::memory(),
map: BTreeMap::new(),
}
}
}
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
Ok(UnorderedPositiveBlobBuilder {
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
map: BTreeMap::new(),
})
}
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
for (key, mut doc_indexes) in self.map {
doc_indexes.sort_unstable();
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
}
self.builder.into_inner()
}
}


@ -1,514 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::error::Error;
use std::fmt;
use ::rocksdb::rocksdb_options;
use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
pub enum NewState {
Updated { value: Vec<u8> },
Removed,
}
pub struct PositiveUpdateBuilder<B> {
path: PathBuf,
schema: Schema,
tokenizer_builder: B,
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: BTreeMap<DocumentKeyAttr, NewState>,
}
impl<B> PositiveUpdateBuilder<B> {
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
PositiveUpdateBuilder {
path: path.into(),
schema: schema,
tokenizer_builder: tokenizer_builder,
builder: UnorderedPositiveBlobBuilder::memory(),
new_states: BTreeMap::new(),
}
}
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
where B: TokenizerBuilder
{
let serializer = Serializer {
schema: &self.schema,
document_id: id,
tokenizer_builder: &self.tokenizer_builder,
builder: &mut self.builder,
new_states: &mut self.new_states
};
Ok(ser::Serialize::serialize(document, serializer)?)
}
// TODO value must be a field that can be indexed
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
let value = bincode::serialize(&value).unwrap();
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
}
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
}
}
#[derive(Debug)]
pub enum SerializerError {
SchemaDontMatch { attribute: String },
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::SchemaDontMatch { attribute } => {
write!(f, "serialized document try to specify the \
{:?} attribute that is not known by the schema", attribute)
},
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
struct Serializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
// Ok(MapSerializer {
// schema: self.schema,
// document_id: self.document_id,
// new_states: self.new_states,
// })
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
struct StructSerializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
match self.schema.attribute(key) {
Some(attr) => {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
}
if props.is_indexed() {
let serializer = IndexerSerializer {
builder: self.builder,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
};
value.serialize(serializer)?;
}
Ok(())
},
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
struct IndexerSerializer<'a, B> {
tokenizer_builder: &'a B,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
document_id: DocumentId,
attribute: SchemaAttr,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: self.attribute.as_u32() as u8,
attribute_index: index as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
self.builder.insert(word_unidecoded, doc_index);
}
self.builder.insert(word_lower, doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
impl<B> PositiveUpdateBuilder<B> {
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
let blob = Blob::Positive(positive_blob);
// write the data-index aka positive blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents fields updates
for (key, state) in self.new_states {
match state {
NewState::Updated { value } => {
file_writer.put(key.as_ref(), &value)?
},
NewState::Removed => file_writer.delete(key.as_ref())?,
}
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -0,0 +1,168 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::path::PathBuf;
use std::error::Error;
use rocksdb::rocksdb_options;
use hashbrown::HashMap;
use fst::map::Map;
use sdset::Set;
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, DocumentKeyAttr};
use crate::database::schema::SchemaAttr;
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;
pub struct RawUpdateBuilder {
sst_file: PathBuf,
document_updates: BTreeMap<DocumentId, DocumentUpdate>,
}
pub struct DocumentUpdate {
cleared: bool,
words_indexes: HashMap<Token, Vec<DocIndex>>,
attributes: BTreeMap<SchemaAttr, Value>,
}
impl DocumentUpdate {
pub fn new() -> DocumentUpdate {
DocumentUpdate {
cleared: false,
words_indexes: HashMap::new(),
attributes: BTreeMap::new(),
}
}
pub fn remove(&mut self) {
self.cleared = true;
self.clear();
}
pub fn clear(&mut self) {
self.words_indexes.clear();
self.attributes.clear();
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
self.attributes.insert(attr, value);
}
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
}
}
impl RawUpdateBuilder {
pub fn new(path: PathBuf) -> RawUpdateBuilder {
RawUpdateBuilder {
sst_file: path,
document_updates: BTreeMap::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
match self.document_updates.entry(document_id) {
Entry::Occupied(mut occupied) => {
occupied.get_mut().clear();
occupied.into_mut()
},
Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
}
}
pub fn build(mut self) -> Result<Update, Box<Error>> {
let mut removed_document_ids = Vec::new();
let mut words_indexes = BTreeMap::new();
for (&id, update) in self.document_updates.iter_mut() {
if update.cleared { removed_document_ids.push(id) }
for (token, indexes) in &update.words_indexes {
words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
}
}
let negative = {
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
let doc_ids = DocIds::new(removed_document_ids);
Negative::new(doc_ids)
};
let positive = {
let mut positive_builder = PositiveBuilder::memory();
for (key, mut indexes) in words_indexes {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
positive_builder.insert(key, indexes)?;
}
let (map, indexes) = positive_builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let index = Index { negative, positive };
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.sst_file.to_string_lossy())?;
// write the data-index
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents attributes updates
for (id, update) in self.document_updates {
let mut last_attr: Option<SchemaAttr> = None;
for (attr, value) in update.attributes {
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));
// delete_range between (last_attr + 1) and (attr - 1)
if let (Some(start), Some(end)) = (start, end) {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
let key = DocumentKeyAttr::new(id, attr);
file_writer.put(key.as_ref(), &value)?;
last_attr = Some(attr);
}
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = DocumentKeyAttr::with_attribute_max(id);
// delete_range between (last_attr + 1) and attr_max
if let Some(start) = start {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
}
file_writer.finish()?;
Ok(Update { sst_file: self.sst_file })
}
}
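The first half of RawUpdateBuilder::build above merges the per-document word indexes into one map keyed by token and sorts every posting list before it reaches the positive index builder. A self-contained sketch of that merge step, with (document_id, word_index) pairs standing in for real DocIndex values:

use std::collections::BTreeMap;

fn main() {
    // two document updates, each with its own token -> indexes list
    let doc0 = vec![(b"ceo".to_vec(), (0u64, 2u32)), (b"geox".to_vec(), (0, 0))];
    let doc1 = vec![(b"geox".to_vec(), (1, 0))];

    let mut words_indexes: BTreeMap<Vec<u8>, Vec<(u64, u32)>> = BTreeMap::new();
    for (token, index) in doc0.into_iter().chain(doc1) {
        words_indexes.entry(token).or_insert_with(Vec::new).push(index);
    }
    // each posting list is sorted before insertion into the positive builder
    for indexes in words_indexes.values_mut() {
        indexes.sort_unstable();
    }
    assert_eq!(words_indexes[&b"geox".to_vec()], vec![(0, 0), (1, 0)]);
}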


@ -9,17 +9,17 @@ use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::blob::positive::PositiveBlob;
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema;
use crate::rank::QueryBuilder;
use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
blob: PositiveBlob,
index: Index,
schema: Schema,
}
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let blob = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, blob, schema })
let index = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, index, schema })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn blob(&self) -> &PositiveBlob {
&self.blob
pub fn index(&self) -> &Index {
&self.index
}
pub fn into_snapshot(self) -> Snapshot<D> {
@ -71,19 +71,18 @@ where D: Deref<Target=DB>
Ok(())
}
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> {
pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
QueryBuilder::new(self)
}
// TODO create an enum error type
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
@ -100,7 +99,7 @@ where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(0);
let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
@ -149,7 +148,7 @@ where D: Deref<Target=DB>,
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.retrieve_document(id)),
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
@ -168,7 +167,7 @@ where D: Deref<Target=DB>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.retrieve_document(id)),
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
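The renamed retrieval methods above are what user code calls to read documents back out of a snapshot. A hypothetical sketch; the Movie type, the module path of DatabaseView and the way the view is obtained are assumptions, only document_by_id and documents_by_id come from the diff above.

use std::error::Error;
use rocksdb::DB;
use serde::Deserialize;
use crate::database::database_view::DatabaseView; // module path assumed
use crate::DocumentId;

#[derive(Debug, Deserialize)]
struct Movie { id: u64, title: String, description: String }

fn print_movies(view: &DatabaseView<&DB>, ids: Vec<DocumentId>) -> Result<(), Box<Error>> {
    // fetch one document back by its DocumentId (assumes ids is non-empty)
    let first: Movie = view.document_by_id(ids[0])?;
    println!("{:?}", first);

    // or iterate over several ids lazily
    for movie in view.documents_by_id::<Movie, _>(ids) {
        println!("{:?}", movie?);
    }
    Ok(())
}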


@ -1,40 +1,49 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;
mod attribute;
mod word_area;
mod common_words;
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub use self::attribute::{Attribute, AttributeError};
pub use self::word_area::{WordArea, WordAreaError};
pub type DocumentId = u64;
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute identifier in the document
/// where the word was found.
///
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// The index where the word was found in the attribute.
/// The position in bytes where the word was found
/// along with the length of it.
///
/// Only the first 1000 words are indexed.
pub attribute_index: u32,
/// It gives the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
/// This structure represents a matching word with information
@ -45,7 +54,7 @@ pub struct DocIndex {
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
@ -57,23 +66,19 @@ pub struct Match {
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in which the word is located
/// (i.e. Title is 0, Description is 1).
///
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// Where does this word is located in the attribute string
/// (i.e. at the start or the end of the attribute).
///
/// The index in the attribute is limited to a maximum of `2^32`
/// this is because we index only the first 1000 words
/// in an attribute.
pub attribute_index: u32,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// Whether the word that match is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It gives the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
impl Match {
@ -81,9 +86,9 @@ impl Match {
Match {
query_index: 0,
distance: 0,
attribute: 0,
attribute_index: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 0),
}
}
@ -91,9 +96,20 @@ impl Match {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u8::max_value(),
attribute_index: u32::max_value(),
attribute: Attribute::max_value(),
is_exact: true,
word_area: WordArea::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}
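As an illustration of the new types, a small test-style sketch that could sit in the tests module above, building one DocIndex by hand the way IndexerSerializer produces them per token; the concrete values are made up, and it only compiles inside this crate since the field of DocumentId is private.

#[test]
fn build_one_doc_index() {
    let doc_index = DocIndex {
        document_id: DocumentId(42),
        attribute: Attribute::new_faillible(0, 2), // attribute 0 of the schema, word index 2
        word_area: WordArea::new_faillible(9, 4),  // word starting at character 9, 4 characters long
    };
    // matches the docindex_mem_size test above
    assert_eq!(std::mem::size_of_val(&doc_index), 16);
}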


@ -10,13 +10,13 @@ use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn contains_exact(matches: &[Match]) -> bool {
fn contains_exact(matches: &&[Match]) -> bool {
matches.iter().any(|m| m.is_exact)
}
#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
GroupBy::new(matches, match_query_index).map(contains_exact).count()
GroupBy::new(matches, match_query_index).filter(contains_exact).count()
}
#[derive(Debug, Clone, Copy)]


@ -29,7 +29,6 @@ pub use self::{
pub trait Criterion<D>
where D: Deref<Target=DB>
{
#[inline]
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
#[inline]
@ -62,6 +61,7 @@ where D: Deref<Target=DB>
}
}
#[derive(Default)]
pub struct CriteriaBuilder<D>
where D: Deref<Target=DB>
{


@ -46,13 +46,18 @@ use crate::rank::Document;
/// let criterion = builder.build();
///
/// ```
#[derive(Default)]
pub struct SortBy<T> {
_phantom: marker::PhantomData<T>,
}
impl<T> SortBy<T> {
pub fn new() -> Self {
SortBy::default()
}
}
impl<T> Default for SortBy<T> {
fn default() -> SortBy<T> {
SortBy { _phantom: marker::PhantomData }
}
}
@ -62,12 +67,12 @@ where D: Deref<Target=DB>,
T: DeserializeOwned + Ord,
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
let lhs = match view.retrieve_document::<T>(lhs.id) {
let lhs = match view.document_by_id::<T>(lhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};
let rhs = match view.retrieve_document::<T>(rhs.id) {
let rhs = match view.document_by_id::<T>(rhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};


@ -11,14 +11,14 @@ use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn sum_matches_typos(matches: &[Match]) -> i8 {
fn sum_matches_typos(matches: &[Match]) -> isize {
let mut sum_typos = 0;
let mut number_words = 0;
// note that GroupBy will never return an empty group
// so we can make this assumption safely
for group in GroupBy::new(matches, match_query_index) {
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
number_words += 1;
}
@ -44,6 +44,8 @@ where D: Deref<Target=DB>
mod tests {
use super::*;
use crate::{DocumentId, Attribute, WordArea};
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
@ -52,22 +54,46 @@ mod tests {
fn one_typo_reference() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
Match {
query_index: 0,
distance: 1,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
@ -85,21 +111,39 @@ mod tests {
fn no_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
@ -117,21 +161,39 @@ mod tests {
fn one_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 1,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
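sum_matches_typos above relies on the matches being grouped by query_index: only the first match of each group contributes its distance, so several matches for the same query word are never double-counted. A reduced, dependency-free sketch of that accumulation, with (query_index, distance) pairs standing in for full Match values:

fn main() {
    // matches are already sorted by query_index, as in the real ranking pipeline
    let matches = [(0u32, 0isize), (0, 2), (1, 1)];

    let mut sum_typos = 0isize;
    let mut last_query_index = None;
    for &(query_index, distance) in &matches {
        if last_query_index != Some(query_index) {
            // first match of a new group: its distance is the one that counts
            sum_typos += distance;
            last_query_index = Some(query_index);
        }
    }
    assert_eq!(sum_typos, 1); // 0 for the first query word + 1 for the second
}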


@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 {
fn sum_matches_attributes(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can make this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.attribute() as usize }
}).sum()
}


@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can make this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute_index
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.word_index() as usize }
}).sum()
}


@ -20,8 +20,8 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
}
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
index_proximity(lhs.attribute_index, rhs.attribute_index)
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
}
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
@ -67,6 +67,8 @@ where D: Deref<Target=DB>
mod tests {
use super::*;
use crate::Attribute;
#[test]
fn three_different_attributes() {
@ -79,11 +81,11 @@ mod tests {
// { id: 3, attr: 3, attr_index: 1 }
let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
];
// soup -> of = 8
@ -105,12 +107,12 @@ mod tests {
// { id: 3, attr: 1, attr_index: 3 }
let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
];
// soup -> of = 1
@ -119,3 +121,42 @@ mod tests {
assert_eq!(matches_proximity(matches), 3);
}
}
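A self-contained sketch of the proximity rule these tests exercise, written over plain `(attribute, word_index)` pairs. The exact `index_proximity` body and the value `MAX_DISTANCE = 8` are assumptions chosen to be consistent with the `soup -> of = 8` and `soup -> of = 1` expectations above, not a copy of the crate's implementation.

```rust
const MAX_DISTANCE: u32 = 8;

// proximity between two word indexes; reading words in reverse order costs one more
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        (rhs - lhs).min(MAX_DISTANCE)
    } else {
        (lhs - rhs + 1).min(MAX_DISTANCE)
    }
}

// matches located in different attributes are always "far away"
fn attribute_proximity(lhs: (u16, u32), rhs: (u16, u32)) -> u32 {
    if lhs.0 != rhs.0 { return MAX_DISTANCE }
    index_proximity(lhs.1, rhs.1)
}

fn main() {
    assert_eq!(attribute_proximity((0, 0), (1, 0)), 8); // different attributes
    assert_eq!(attribute_proximity((1, 0), (1, 1)), 1); // adjacent words
    assert_eq!(attribute_proximity((1, 5), (1, 2)), 4); // reversed order
}
```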
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use self::test::Bencher;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use crate::Attribute;
#[bench]
fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
let number_matches = 30_000;
let mut matches = Vec::with_capacity(number_matches);
let mut rng = XorShiftRng::seed_from_u64(42);
for _ in 0..number_matches {
let query_index = rng.gen_range(0, 4);
let attribute = rng.gen_range(0, 5);
let word_index = rng.gen_range(0, 15);
let attribute = Attribute::new_faillible(attribute, word_index);
let match_ = Match { query_index, attribute, ..Match::zero() };
matches.push(match_);
}
bench.iter(|| {
let proximity = matches_proximity(&matches);
test::black_box(move || proximity)
});
Ok(())
}
}


@ -4,7 +4,7 @@ mod distinct_map;
use crate::{Match, DocumentId};
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {


@ -4,10 +4,11 @@ use std::error::Error;
use std::hash::Hash;
use std::rc::Rc;
use group_by::GroupByMut;
use group_by::BinaryGroupByMut;
use hashbrown::HashMap;
use fst::Streamer;
use rocksdb::DB;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
@ -34,14 +35,17 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
automatons
}
pub struct QueryBuilder<'a, D>
pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>
{
view: &'a DatabaseView<D>,
criteria: Criteria<D>,
filter: Option<FI>,
}
impl<'a, D> QueryBuilder<'a, D>
impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
where D: Deref<Target=DB>
{
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
@ -49,19 +53,27 @@ where D: Deref<Target=DB>
}
}
impl<'a, D> QueryBuilder<'a, D>
where D: Deref<Target=DB>
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
{
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
Ok(QueryBuilder { view, criteria })
Ok(QueryBuilder { view, criteria, filter: None })
}
pub fn criteria(&mut self, criteria: Criteria<D>) -> &mut Self {
self.criteria = criteria;
self
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
QueryBuilder {
view: self.view,
criteria: self.criteria,
filter: Some(function)
}
}
pub fn with_distinct<F>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, F> {
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
DistinctQueryBuilder {
inner: self,
function: function,
@ -75,12 +87,13 @@ where D: Deref<Target=DB>
let mut stream = {
let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons {
let stream = self.view.blob().as_map().search(automaton);
let stream = self.view.index().positive.map().search(automaton);
op_builder.push(stream);
}
op_builder.union()
};
let mut number_matches = 0;
let mut matches = HashMap::new();
while let Some((input, indexed_values)) = stream.next() {
@ -89,39 +102,55 @@ where D: Deref<Target=DB>
let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = self.view.blob().as_indexes();
let doc_indexes = &self.view.index().positive.indexes();
let doc_indexes = &doc_indexes[iv.value as usize];
number_matches += doc_indexes.len();
for doc_index in doc_indexes {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: doc_index.attribute,
attribute_index: doc_index.attribute_index,
is_exact: is_exact,
word_area: doc_index.word_area,
};
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
}
}
}
matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect()
info!("{} total documents to classify", matches.len());
info!("{} total matches to classify", number_matches);
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
}
}
impl<'a, D> QueryBuilder<'a, D>
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.query_all(query);
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
// We give the filtering work to the query distinct builder,
// specifying a distinct rule that has no effect.
if self.filter.is_some() {
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
return builder.query(query, range);
}
let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query));
info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()];
let view = &self.view;
'criteria: for criterion in self.criteria.as_ref() {
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
@ -130,9 +159,12 @@ where D: Deref<Target=DB>,
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
let (elapsed, ()) = elapsed::measure_time(|| {
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
});
info!("criterion {} sort took {}", ci, elapsed);
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
documents_seen += group.len();
groups.push(group);
@ -152,25 +184,41 @@ where D: Deref<Target=DB>,
}
}
pub struct DistinctQueryBuilder<'a, D, F>
pub struct DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>
{
inner: QueryBuilder<'a, D>,
function: F,
inner: QueryBuilder<'a, D, FI>,
function: FD,
size: usize,
}
impl<'a, D, F, K> DistinctQueryBuilder<'a, D, F>
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query);
let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let view = &self.inner.view;
let mut filter_map = HashMap::new();
// these two variables inform us about the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
@ -193,17 +241,27 @@ where D: Deref<Target=DB>,
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
@ -229,16 +287,22 @@ where D: Deref<Target=DB>,
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
if accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
}
}
}
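To make the new filter plumbing concrete, a hedged usage sketch follows. `FilterFunc<D>` acts as the default value of the new `FI` type parameter, which is what lets `QueryBuilder::new` keep its previous signature; the closure bodies, the `view` argument and the imports below are illustrative assumptions, and only the `new`, `with_filter`, `with_distinct` and `query` calls come from this diff.

```rust
use std::ops::Deref;
use rocksdb::DB;

// Hypothetical caller combining the new filter with the existing distinct
// support; `DatabaseView`, `Document` and `DocumentId` are the crate's types
// and are assumed to be in scope.
fn filtered_search<D>(view: &DatabaseView<D>) -> Vec<Document>
where
    D: Deref<Target = DB>,
{
    QueryBuilder::new(view)
        .expect("failed to create the query builder")
        // keep only documents accepted by some (hypothetical) predicate
        .with_filter(|_id: DocumentId, _view: &DatabaseView<D>| true)
        // deduplicate documents sharing the same (hypothetical) distinct key
        .with_distinct(|id: DocumentId, _view: &DatabaseView<D>| Some(id), 1)
        .query("hello world", 0..20)
}
```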


@ -2,7 +2,7 @@ use std::mem;
use self::Separator::*;
pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}
pub struct DefaultBuilder;
@ -13,22 +13,39 @@ impl DefaultBuilder {
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}
pub struct Tokenizer<'a> {
index: usize,
word_index: usize,
char_index: usize,
inner: &'a str,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer {
index: 0,
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]),
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
@ -56,43 +73,58 @@ impl Separator {
}
}
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = (usize, &'a str);
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;
for (i, c) in self.inner.char_indices() {
let separator = match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
};
match separator {
Some(dist) => {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (word, tail) = self.inner.split_at(i);
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.index += distance.map(Separator::to_usize).unwrap_or(0);
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
}
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
distance.replace(distance.map_or(sep, |s| s.add(sep)));
},
None => { start_word.get_or_insert(i); },
}
}
if let Some(start_word) = start_word {
let word = mem::replace(&mut self.inner, "");
self.index += distance.map(Separator::to_usize).unwrap_or(0);
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);
let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}
None
@ -107,12 +139,12 @@ mod tests {
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some((0, "salut")));
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
@ -120,18 +152,37 @@ mod tests {
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((1, "lolo")));
assert_eq!(tokenizer.next(), Some((9, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((8, "lolo")));
assert_eq!(tokenizer.next(), Some((16, "wtf")));
assert_eq!(tokenizer.next(), Some((24, "lol")));
assert_eq!(tokenizer.next(), Some((32, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
}
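A short usage sketch of the Token-based API above. Judging from these tests, a run of separators containing a long one (`.`, `;`, `,`, `!`, `?`, `-`) advances `word_index` by 8 while a run of short ones (space, `'`, `"`) advances it by 1, and `char_index` counts characters rather than bytes. The snippet only assumes `Tokenizer` and `Token` are in scope.

```rust
fn print_tokens(text: &str) {
    // every Token carries the word together with its word and character positions
    for token in Tokenizer::new(text) {
        println!(
            "{:>6} word_index={:<3} char_index={}",
            token.word, token.word_index, token.char_index
        );
    }
}

fn main() {
    // with the weights above, "lolo" should come out with word_index 8
    print_tokens("yo ! lolo ? wtf");
}
```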


@ -1,51 +0,0 @@
use std::ops::Deref;
use std::sync::Arc;
use std::fmt;
#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
inner: Arc<Vec<T>>,
offset: usize,
len: usize,
}
impl<T> VecReadOnly<T> {
pub fn new(vec: Vec<T>) -> Self {
let len = vec.len();
Self {
inner: Arc::new(vec),
offset: 0,
len: len,
}
}
pub fn len(&self) -> usize {
self.len
}
pub fn range(&self, offset: usize, len: usize) -> Self {
Self {
inner: self.inner.clone(),
offset: self.offset + offset,
len: len,
}
}
pub fn as_slice(&self) -> &[T] {
&self.inner[self.offset..self.offset + self.len]
}
}
impl<T> Deref for VecReadOnly<T> {
type Target = [T];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.inner.fmt(f)
}
}

src/word_area.rs (new file, 102 lines)

@ -0,0 +1,102 @@
use std::fmt;
/// Represent a word position in characters along with the length of it.
///
/// It can represent character indexes up to 2^22 and
/// word lengths up to 1024.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);
impl WordArea {
/// Construct a `WordArea` from a word position expressed as
/// a number of characters and the length of it.
///
/// # Panics
///
/// The char index must not be greater than 2^22
/// and the length not greater than 1024.
pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(WordAreaError::ByteIndexTooBig)
}
if length & 0b1111_1100_0000_0000 != 0 {
return Err(WordAreaError::LengthTooBig)
}
let char_index = char_index << 10;
Ok(WordArea(char_index | u32::from(length)))
}
pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
match WordArea::new(char_index, length) {
Ok(word_area) => word_area,
Err(WordAreaError::ByteIndexTooBig) => {
panic!("word area byte index must not be greater than 2^22")
},
Err(WordAreaError::LengthTooBig) => {
panic!("word area length must not be greater than 1024")
},
}
}
pub(crate) fn max_value() -> WordArea {
WordArea(u32::max_value())
}
#[inline]
pub fn char_index(self) -> u32 {
self.0 >> 10
}
#[inline]
pub fn length(self) -> u16 {
(self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16
}
}
impl fmt::Debug for WordArea {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("WordArea")
.field("char_index", &self.char_index())
.field("length", &self.length())
.finish()
}
}
pub enum WordAreaError {
ByteIndexTooBig,
LengthTooBig,
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
quickcheck! {
fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
return TestResult::discard()
}
let word_area = WordArea::new_faillible(gen_char_index, gen_length);
let valid_char_index = word_area.char_index() == gen_char_index;
let valid_length = word_area.length() == gen_length;
TestResult::from_bool(valid_char_index && valid_length)
}
fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
return TestResult::discard()
}
let a = WordArea::new_faillible(gen_char_index, gen_length);
let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);
TestResult::from_bool(a < b)
}
}
}
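Finally, a standalone sketch of the bit layout `WordArea` relies on: the 10 low bits hold the length and the remaining 22 bits hold the character index, so the packed `u32` orders first by position and then by length. The `pack` and `unpack` helpers below only illustrate that layout and are not part of the crate's API.

```rust
fn pack(char_index: u32, length: u16) -> u32 {
    // same limits as WordArea: char_index < 2^22, length < 2^10
    assert!(char_index < (1 << 22) && u32::from(length) < (1 << 10));
    (char_index << 10) | u32::from(length)
}

fn unpack(area: u32) -> (u32, u16) {
    (area >> 10, (area & 0b11_1111_1111) as u16)
}

fn main() {
    let area = pack(42, 6); // a 6-character word starting at character 42
    assert_eq!(unpack(area), (42, 6));
    // ordering follows char_index first, mirroring WordArea's derived Ord
    assert!(pack(42, 6) < pack(43, 1));
    println!("packed: {:#034b}", area);
}
```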