mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-19 04:50:37 +00:00
Compare commits
134 Commits
Author | SHA1 | Date | |
---|---|---|---|
c615c31016 | |||
908b28790b | |||
4c0279729b | |||
96dfac5b33 | |||
8576218b51 | |||
1c1f9201b8 | |||
4398b88a3a | |||
73e79f5ca4 | |||
1bfd51d6e9 | |||
0d2daf27f2 | |||
87f0d8cf3c | |||
06d5a10902 | |||
94b89c5439 | |||
c5e951be09 | |||
66ae5c8161 | |||
8438e2202f | |||
7a6166d229 | |||
d46fa4b215 | |||
2bd5b4ab86 | |||
5efbc5ceb3 | |||
2e905bac08 | |||
4c0ad5f964 | |||
455cbf3bf4 | |||
a3a28c56fa | |||
b0b3175641 | |||
c2f0df3f73 | |||
820f1f9ac6 | |||
337aee5b65 | |||
810dfdf656 | |||
f016652fca | |||
6c99ebe3fa | |||
94d357985f | |||
fbc698567a | |||
aa9db14c09 | |||
61e83a1c21 | |||
1316be5b09 | |||
4e8b0383dd | |||
4fa10753c1 | |||
2473e289e8 | |||
e0e5e87ed3 | |||
b13e61f40a | |||
c023cb3065 | |||
0a3d069fbc | |||
fa062ce2cf | |||
cdc6e47bf5 | |||
d5f44838be | |||
5939f6e68a | |||
97edc987f8 | |||
e4e50cecce | |||
77e0c19749 | |||
251bccbbc3 | |||
f7561f8552 | |||
05fd7e87ec | |||
446d6a5455 | |||
78786a0007 | |||
3d820a27ee | |||
ac347d788c | |||
5627f15d41 | |||
e31afc2da2 | |||
77c252e12a | |||
30c9c053c2 | |||
b53ef08d05 | |||
86bfb173ef | |||
8e5f834625 | |||
563b021679 | |||
681f721b1d | |||
8a7c061539 | |||
8c781a4d05 | |||
de59ea495d | |||
966eda8ae5 | |||
32f8908d71 | |||
a2f5e8aa25 | |||
f00b978801 | |||
a78b5d225f | |||
f32a59720d | |||
2cc5fbde1a | |||
34d2850d28 | |||
023f62b0ce | |||
7f35b971f0 | |||
3418adb06a | |||
510426c05c | |||
c74caa0f82 | |||
d899b86603 | |||
0d07af3caf | |||
c594597a01 | |||
ef7ba96d4a | |||
d21406a939 | |||
039a9a4cc7 | |||
40ab9e7a55 | |||
d21abb50fa | |||
3dd5e2445a | |||
7f5e6c5b6e | |||
e6d3840f12 | |||
c05fab783a | |||
95dc6fe904 | |||
b2e9ae4136 | |||
b070778d44 | |||
6731025003 | |||
04544c1531 | |||
9dd68b4eaa | |||
1d67012aa5 | |||
e723e01ec8 | |||
7845292ea8 | |||
521df85c0d | |||
dfa19582a2 | |||
87ec95f7a0 | |||
76ef2cceeb | |||
20b5a6a06e | |||
a842e647f7 | |||
21bb38c3b0 | |||
64d53ee1bd | |||
c022fa3fca | |||
0080bf486f | |||
6bd779f9ae | |||
a18401f47e | |||
7132c3be89 | |||
aa3d059363 | |||
e2a9dbc404 | |||
a0a11faee5 | |||
36ef9581aa | |||
f4b04dfb72 | |||
cf5d56e63a | |||
8412c14b5b | |||
70772eca5c | |||
b27f632e14 | |||
e3bfb866e5 | |||
fa238f21ef | |||
444a4c1af7 | |||
2e5c5fad33 | |||
b32c96cdc9 | |||
62521262e8 | |||
4ebae7784c | |||
a756ca5e3f | |||
aa104fa253 |
@ -11,8 +11,8 @@ matrix:
|
||||
include:
|
||||
|
||||
# Test crates on their minimum Rust versions.
|
||||
- rust: 1.31.0
|
||||
name: "meilidb on 1.31.0"
|
||||
- rust: 1.32.0
|
||||
name: "meilidb on 1.32.0"
|
||||
script: ./ci/meilidb.sh
|
||||
|
||||
# Test crates on nightly Rust.
|
||||
|
36
Cargo.toml
36
Cargo.toml
@ -1,39 +1,55 @@
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "meilidb"
|
||||
version = "0.1.0"
|
||||
version = "0.3.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
|
||||
[dependencies]
|
||||
bincode = "1.0"
|
||||
byteorder = "1.2"
|
||||
crossbeam = "0.6"
|
||||
elapsed = "0.1"
|
||||
fst = "0.3"
|
||||
hashbrown = "0.1"
|
||||
hashbrown = { version = "0.1", features = ["serde"] }
|
||||
lazy_static = "1.1"
|
||||
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
|
||||
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
|
||||
log = "0.4"
|
||||
sdset = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = { version = "1.0", features = ["preserve_order"] }
|
||||
slice-group-by = "0.2"
|
||||
unidecode = "0.3"
|
||||
rayon = "1.0"
|
||||
lockfree = "0.5.1"
|
||||
|
||||
[dependencies.toml]
|
||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||
features = ["preserve_order"]
|
||||
rev = "0372ba6"
|
||||
|
||||
[dependencies.rocksdb]
|
||||
git = "https://github.com/pingcap/rust-rocksdb.git"
|
||||
rev = "c2eb140"
|
||||
|
||||
[dependencies.group-by]
|
||||
git = "https://github.com/Kerollmops/group-by.git"
|
||||
rev = "cab857b"
|
||||
rev = "306e201"
|
||||
|
||||
[features]
|
||||
default = ["simd"]
|
||||
i128 = ["bincode/i128", "byteorder/i128"]
|
||||
simd = ["rocksdb/sse"]
|
||||
portable = ["rocksdb/portable"]
|
||||
nightly = []
|
||||
simd = ["rocksdb/sse"]
|
||||
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
|
||||
|
||||
[dev-dependencies]
|
||||
csv = "1.0"
|
||||
elapsed = "0.1"
|
||||
env_logger = "0.6"
|
||||
jemallocator = "0.1"
|
||||
quickcheck = "0.8"
|
||||
rand = "0.6"
|
||||
rand_xorshift = "0.1"
|
||||
structopt = "0.2"
|
||||
tempfile = "3.0"
|
||||
termcolor = "1.0"
|
||||
|
||||
[profile.release]
|
||||
debug = true
|
||||
|
49
README.md
49
README.md
@ -1,47 +1,60 @@
|
||||
# MeiliDB
|
||||
|
||||
[](https://travis-ci.org/Kerollmops/MeiliDB)
|
||||
[](https://deps.rs/repo/github/Kerollmops/MeiliDB)
|
||||
[](https://github.com/Kerollmops/MeiliDB)
|
||||
[](
|
||||
https://www.rust-lang.org)
|
||||
|
||||
A _full-text search database_ using a key-value store internally.
|
||||
|
||||
It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
|
||||
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
|
||||
|
||||
You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
|
||||
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
|
||||
|
||||
We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
|
||||
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
|
||||
|
||||
At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
|
||||
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `misc/` folder.
|
||||
|
||||
In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
|
||||
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
|
||||
|
||||
|
||||
|
||||
## Performances
|
||||
|
||||
_these informations have been made with a version dated of october 2018, we must update them_
|
||||
With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
|
||||
So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
|
||||
|
||||
We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
|
||||
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
|
||||
|
||||
- near 190 users with an average response time of 90ms
|
||||
- 150 users with an average response time of 70ms
|
||||
- 100 users with an average response time of 45ms
|
||||
|
||||
Network is mesured, servers are located in amsterdam and tests are made between two different datacenters.
|
||||
```
|
||||
Running 10s test @ http://localhost:2230
|
||||
2 threads and 12 connections
|
||||
Thread Stats Avg Stdev Max +/- Stdev
|
||||
Latency 18.86ms 49.39ms 614.89ms 95.23%
|
||||
Req/Sec 620.41 59.53 790.00 65.00%
|
||||
12359 requests in 10.00s, 3.26MB read
|
||||
Requests/sec: 1235.54
|
||||
Transfer/sec: 334.22KB
|
||||
```
|
||||
|
||||
### Notes
|
||||
|
||||
The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
|
||||
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
|
||||
|
||||
## Usage and examples
|
||||
|
||||
MeiliDB work with an index like most of the search engines.
|
||||
MeiliDB runs with an index like most search engines.
|
||||
So to test the library you can create one by indexing a simple csv file.
|
||||
|
||||
```bash
|
||||
cargo run --release --example create-database -- test.mdb misc/kaggle.csv
|
||||
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
|
||||
```
|
||||
|
||||
Once the command finished indexing the database should have been saved under the `test.mdb` folder.
|
||||
|
||||
Now you can easily run the `query-database` example to check what is stored in it.
|
||||
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
|
||||
|
||||
```bash
|
||||
cargo run --release --example query-database -- test.mdb
|
||||
cargo run --release --example query-database -- test.mdb -n 10 id title
|
||||
```
|
||||
|
||||
|
@ -1,91 +1,131 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
|
||||
use meilidb::database::update::PositiveUpdateBuilder;
|
||||
use meilidb::database::{Database, Schema};
|
||||
use meilidb::tokenizer::DefaultBuilder;
|
||||
use meilidb::database::Database;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Opt {
|
||||
/// The destination where the database must be created
|
||||
/// The destination where the database must be created.
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
/// The csv file to index.
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub csv_data_path: PathBuf,
|
||||
|
||||
/// The path to the schema.
|
||||
#[structopt(long = "schema", parse(from_os_str))]
|
||||
pub schema_path: PathBuf,
|
||||
|
||||
/// The path to the list of stop words (one by line).
|
||||
#[structopt(long = "stop-words", parse(from_os_str))]
|
||||
pub stop_words_path: Option<PathBuf>,
|
||||
|
||||
#[structopt(long = "update-group-size")]
|
||||
pub update_group_size: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct Document<'a> {
|
||||
id: &'a str,
|
||||
title: &'a str,
|
||||
description: &'a str,
|
||||
image: &'a str,
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct Document<'a> (
|
||||
#[serde(borrow)]
|
||||
HashMap<Cow<'a, str>, Cow<'a, str>>
|
||||
);
|
||||
|
||||
fn calculate_hash<T: Hash>(t: &T) -> u64 {
|
||||
let mut s = DefaultHasher::new();
|
||||
t.hash(&mut s);
|
||||
s.finish()
|
||||
}
|
||||
fn index(
|
||||
schema: Schema,
|
||||
database_path: &Path,
|
||||
csv_data_path: &Path,
|
||||
update_group_size: Option<usize>,
|
||||
stop_words: &HashSet<String>,
|
||||
) -> Result<Database, Box<Error>>
|
||||
{
|
||||
let database = Database::create(database_path)?;
|
||||
|
||||
fn create_schema() -> Schema {
|
||||
let mut schema = SchemaBuilder::new();
|
||||
schema.new_attribute("id", STORED);
|
||||
schema.new_attribute("title", STORED | INDEXED);
|
||||
schema.new_attribute("description", STORED | INDEXED);
|
||||
schema.new_attribute("image", STORED);
|
||||
schema.build()
|
||||
}
|
||||
|
||||
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
|
||||
let database = Database::create(database_path, schema.clone())?;
|
||||
|
||||
println!("start indexing...");
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let update_path = tempfile::NamedTempFile::new()?;
|
||||
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
|
||||
database.create_index("default", &schema)?;
|
||||
|
||||
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
||||
let mut raw_record = csv::StringRecord::new();
|
||||
let headers = rdr.headers()?.clone();
|
||||
|
||||
while rdr.read_record(&mut raw_record)? {
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
continue;
|
||||
let mut i = 0;
|
||||
let mut end_of_file = false;
|
||||
|
||||
while !end_of_file {
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut update = database.start_update("default")?;
|
||||
|
||||
loop {
|
||||
end_of_file = !rdr.read_record(&mut raw_record)?;
|
||||
if end_of_file { break }
|
||||
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
update.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
|
||||
print!("\rindexing document {}", i);
|
||||
i += 1;
|
||||
|
||||
if let Some(group_size) = update_group_size {
|
||||
if i % group_size == 0 { break }
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let document_id = calculate_hash(&document.id);
|
||||
update.update(document_id, &document).unwrap();
|
||||
println!();
|
||||
|
||||
println!("committing update...");
|
||||
database.commit_update(update)?;
|
||||
}
|
||||
|
||||
let mut update = update.build()?;
|
||||
|
||||
update.set_move(true);
|
||||
database.ingest_update_file(update)?;
|
||||
|
||||
Ok(database)
|
||||
}
|
||||
|
||||
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
|
||||
let f = File::open(path)?;
|
||||
let reader = BufReader::new(f);
|
||||
let mut words = HashSet::new();
|
||||
|
||||
for line in reader.lines() {
|
||||
let line = line?;
|
||||
let word = line.trim().to_string();
|
||||
words.insert(word);
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let _ = env_logger::init();
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let schema = create_schema();
|
||||
let schema = {
|
||||
let file = File::open(&opt.schema_path)?;
|
||||
Schema::from_toml(file)?
|
||||
};
|
||||
|
||||
let stop_words = match opt.stop_words_path {
|
||||
Some(ref path) => retrieve_stop_words(path)?,
|
||||
None => HashSet::new(),
|
||||
};
|
||||
|
||||
let (elapsed, result) = elapsed::measure_time(|| {
|
||||
index(schema, &opt.database_path, &opt.csv_data_path)
|
||||
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
@ -93,6 +133,5 @@ fn main() -> Result<(), Box<Error>> {
|
||||
}
|
||||
|
||||
println!("database created in {} at: {:?}", elapsed, opt.database_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,11 +1,19 @@
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::iter::FromIterator;
|
||||
use std::io::{self, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use meilidb::database::schema::SchemaAttr;
|
||||
use meilidb::database::Database;
|
||||
use meilidb::Match;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Opt {
|
||||
@ -13,20 +21,85 @@ pub struct Opt {
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
/// Fields that must be displayed.
|
||||
pub displayed_fields: Vec<String>,
|
||||
|
||||
/// The number of returned results
|
||||
#[structopt(short = "n", long = "number-results", default_value = "10")]
|
||||
pub number_results: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct Document {
|
||||
id: String,
|
||||
title: String,
|
||||
description: String,
|
||||
image: String,
|
||||
type Document = HashMap<String, String>;
|
||||
|
||||
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
||||
let mut stdout = StandardStream::stdout(ColorChoice::Always);
|
||||
let mut highlighted = false;
|
||||
|
||||
for range in ranges.windows(2) {
|
||||
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
|
||||
if highlighted {
|
||||
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
|
||||
}
|
||||
write!(&mut stdout, "{}", &text[start..end])?;
|
||||
stdout.reset()?;
|
||||
highlighted = !highlighted;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
|
||||
let mut byte_index = 0;
|
||||
let mut byte_length = 0;
|
||||
|
||||
for (n, (i, c)) in text.char_indices().enumerate() {
|
||||
if n == index {
|
||||
byte_index = i;
|
||||
}
|
||||
|
||||
if n + 1 == index + length {
|
||||
byte_length = i - byte_index + c.len_utf8();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
(byte_index, byte_length)
|
||||
}
|
||||
|
||||
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
|
||||
let mut byte_indexes = BTreeMap::new();
|
||||
|
||||
for match_ in matches {
|
||||
let match_attribute = match_.attribute;
|
||||
if SchemaAttr::new(match_attribute) == attribute {
|
||||
let char_index = match_.char_index as usize;
|
||||
let char_length = match_.char_length as usize;
|
||||
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||
|
||||
match byte_indexes.entry(byte_index) {
|
||||
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
||||
Entry::Occupied(mut entry) => {
|
||||
if *entry.get() < byte_length {
|
||||
entry.insert(byte_length);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut title_areas = Vec::new();
|
||||
title_areas.push(0);
|
||||
for (byte_index, length) in byte_indexes {
|
||||
title_areas.push(byte_index);
|
||||
title_areas.push(byte_index + length);
|
||||
}
|
||||
title_areas.push(text.len());
|
||||
title_areas.sort_unstable();
|
||||
title_areas
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let _ = env_logger::init();
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
|
||||
@ -41,26 +114,53 @@ fn main() -> Result<(), Box<Error>> {
|
||||
io::stdout().flush()?;
|
||||
|
||||
if input.read_line(&mut buffer)? == 0 { break }
|
||||
let query = buffer.trim_end_matches('\n');
|
||||
|
||||
let view = database.view();
|
||||
let view = database.view("default")?;
|
||||
let schema = view.schema();
|
||||
|
||||
let (elapsed, documents) = elapsed::measure_time(|| {
|
||||
let builder = view.query_builder().unwrap();
|
||||
builder.query(&buffer, 0..opt.number_results)
|
||||
builder.query(query, 0..opt.number_results)
|
||||
});
|
||||
|
||||
let mut full_documents = Vec::with_capacity(documents.len());
|
||||
let number_of_documents = documents.len();
|
||||
for doc in documents {
|
||||
match view.document_by_id::<Document>(doc.id) {
|
||||
Ok(document) => {
|
||||
for name in &opt.displayed_fields {
|
||||
let attr = match schema.attribute(name) {
|
||||
Some(attr) => attr,
|
||||
None => continue,
|
||||
};
|
||||
let text = match document.get(name) {
|
||||
Some(text) => text,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
for document in documents {
|
||||
match view.retrieve_document::<Document>(document.id) {
|
||||
Ok(document) => full_documents.push(document),
|
||||
print!("{}: ", name);
|
||||
let areas = create_highlight_areas(&text, &doc.matches, attr);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
}
|
||||
},
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
|
||||
let mut matching_attributes = HashSet::new();
|
||||
for _match in doc.matches {
|
||||
let attr = SchemaAttr::new(_match.attribute);
|
||||
let name = schema.attribute_name(attr);
|
||||
matching_attributes.insert(name);
|
||||
}
|
||||
|
||||
let matching_attributes = Vec::from_iter(matching_attributes);
|
||||
println!("matching in: {:?}", matching_attributes);
|
||||
|
||||
println!();
|
||||
}
|
||||
|
||||
println!("{:#?}", full_documents);
|
||||
println!("Found {} results in {}", full_documents.len(), elapsed);
|
||||
|
||||
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
|
19
examples/schema-example.toml
Normal file
19
examples/schema-example.toml
Normal file
@ -0,0 +1,19 @@
|
||||
# This schema has been generated ...
|
||||
# The order in which the attributes are declared is important,
|
||||
# it specify the attribute xxx...
|
||||
|
||||
identifier = "id"
|
||||
|
||||
[attributes.id]
|
||||
stored = true
|
||||
|
||||
[attributes.title]
|
||||
stored = true
|
||||
indexed = true
|
||||
|
||||
[attributes.description]
|
||||
stored = true
|
||||
indexed = true
|
||||
|
||||
[attributes.image]
|
||||
stored = true
|
@ -95,7 +95,8 @@ or
|
||||
other
|
||||
ought
|
||||
our
|
||||
ours ourselves
|
||||
ours
|
||||
ourselves
|
||||
out
|
||||
over
|
||||
own
|
||||
|
163
misc/fr.stopwords.txt
Normal file
163
misc/fr.stopwords.txt
Normal file
@ -0,0 +1,163 @@
|
||||
au
|
||||
aux
|
||||
avec
|
||||
ce
|
||||
ces
|
||||
dans
|
||||
de
|
||||
des
|
||||
du
|
||||
elle
|
||||
en
|
||||
et
|
||||
eux
|
||||
il
|
||||
je
|
||||
la
|
||||
le
|
||||
leur
|
||||
lui
|
||||
ma
|
||||
mais
|
||||
me
|
||||
même
|
||||
mes
|
||||
moi
|
||||
mon
|
||||
ne
|
||||
nos
|
||||
notre
|
||||
nous
|
||||
on
|
||||
ou
|
||||
par
|
||||
pas
|
||||
pour
|
||||
qu
|
||||
que
|
||||
qui
|
||||
sa
|
||||
se
|
||||
ses
|
||||
son
|
||||
sur
|
||||
ta
|
||||
te
|
||||
tes
|
||||
toi
|
||||
ton
|
||||
tu
|
||||
un
|
||||
une
|
||||
vos
|
||||
votre
|
||||
vous
|
||||
c
|
||||
d
|
||||
j
|
||||
l
|
||||
Ã
|
||||
m
|
||||
n
|
||||
s
|
||||
t
|
||||
y
|
||||
été
|
||||
étée
|
||||
étées
|
||||
étés
|
||||
étant
|
||||
suis
|
||||
es
|
||||
est
|
||||
sommes
|
||||
êtes
|
||||
sont
|
||||
serai
|
||||
seras
|
||||
sera
|
||||
serons
|
||||
serez
|
||||
seront
|
||||
serais
|
||||
serait
|
||||
serions
|
||||
seriez
|
||||
seraient
|
||||
étais
|
||||
était
|
||||
étions
|
||||
étiez
|
||||
étaient
|
||||
fus
|
||||
fut
|
||||
fûmes
|
||||
fûtes
|
||||
furent
|
||||
sois
|
||||
soit
|
||||
soyons
|
||||
soyez
|
||||
soient
|
||||
fusse
|
||||
fusses
|
||||
fût
|
||||
fussions
|
||||
fussiez
|
||||
fussent
|
||||
ayant
|
||||
eu
|
||||
eue
|
||||
eues
|
||||
eus
|
||||
ai
|
||||
as
|
||||
avons
|
||||
avez
|
||||
ont
|
||||
aurai
|
||||
auras
|
||||
aura
|
||||
aurons
|
||||
aurez
|
||||
auront
|
||||
aurais
|
||||
aurait
|
||||
aurions
|
||||
auriez
|
||||
auraient
|
||||
avais
|
||||
avait
|
||||
avions
|
||||
aviez
|
||||
avaient
|
||||
eut
|
||||
eûmes
|
||||
eûtes
|
||||
eurent
|
||||
aie
|
||||
aies
|
||||
ait
|
||||
ayons
|
||||
ayez
|
||||
aient
|
||||
eusse
|
||||
eusses
|
||||
eût
|
||||
eussions
|
||||
eussiez
|
||||
eussent
|
||||
ceci
|
||||
celÃ
|
||||
cet
|
||||
cette
|
||||
ici
|
||||
ils
|
||||
les
|
||||
leurs
|
||||
quel
|
||||
quels
|
||||
quelle
|
||||
quelles
|
||||
sans
|
||||
soi
|
@ -50,6 +50,7 @@ impl AutomatonExt for DfaExt {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum PrefixSetting {
|
||||
Prefix,
|
||||
NoPrefix,
|
||||
|
@ -1,59 +1,54 @@
|
||||
use std::io::{self, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds {
|
||||
data: Data,
|
||||
}
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
let data = Data::Mmap(mmap);
|
||||
Ok(DocIds { data })
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
// FIXME check if modulo DocumentId
|
||||
let len = vec.len();
|
||||
let data = Data::Shared {
|
||||
bytes: Arc::new(vec),
|
||||
offset: 0,
|
||||
len: len
|
||||
};
|
||||
Ok(DocIds { data })
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let doc_ids = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIds(doc_ids))
|
||||
}
|
||||
|
||||
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn doc_ids(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.data;
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / mem::size_of::<DocumentId>();
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for DocIds {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.data.as_ref().serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
@ -1,16 +1,15 @@
|
||||
use std::io::{self, Write, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
@ -21,52 +20,45 @@ struct Range {
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
DocIndexes::from_data(Data::Mmap(mmap))
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
|
||||
let bytes = Arc::new(bytes);
|
||||
let len = bytes.len();
|
||||
let data = SharedData::new(bytes, 0, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
DocIndexes::from_cursor(&mut cursor)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let ranges = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = Data::Shared { bytes, offset, len };
|
||||
DocIndexes::from_data(data)
|
||||
}
|
||||
|
||||
fn from_data(data: Data) -> io::Result<Self> {
|
||||
let ranges_len_offset = data.len() - size_of::<u64>();
|
||||
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize;
|
||||
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = data.range(ranges_offset, ranges_len);
|
||||
|
||||
let indexes = data.range(0, ranges_offset);
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let indexes = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn to_vec(&self) -> Vec<u8> {
|
||||
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
|
||||
let mut bytes = Vec::with_capacity(capacity);
|
||||
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
|
||||
|
||||
bytes
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
self.ranges().get(index as usize).map(|Range { start, end }| {
|
||||
self.ranges().get(index).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
let slice = &self.indexes()[start..end];
|
||||
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder::new(Vec::new())
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
// write the values
|
||||
let indexes = unsafe { into_u8_slice(indexes) };
|
||||
self.wtr.write_all(indexes)
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
@ -135,40 +131,55 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
// write the ranges
|
||||
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
||||
let ptr = slice.as_ptr() as *const u8;
|
||||
let len = slice.len() * size_of::<T>();
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
use crate::DocumentId;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: 3,
|
||||
word_index: 11,
|
||||
char_index: 30,
|
||||
char_length: 4,
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: 4,
|
||||
word_index: 21,
|
||||
char_index: 35,
|
||||
char_length: 6,
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: 8,
|
||||
word_index: 2,
|
||||
char_index: 89,
|
||||
char_length: 6,
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(bytes)?;
|
||||
@ -183,19 +194,39 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: 3,
|
||||
word_index: 11,
|
||||
char_index: 30,
|
||||
char_length: 4,
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: 4,
|
||||
word_index: 21,
|
||||
char_index: 35,
|
||||
char_length: 6,
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: 8,
|
||||
word_index: 2,
|
||||
char_index: 89,
|
||||
char_length: 6,
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let builder_bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
|
||||
let bytes = docs.to_vec();
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
docs.write_to_bytes(&mut bytes);
|
||||
|
||||
assert_eq!(builder_bytes, bytes);
|
||||
|
||||
|
@ -1,51 +1,43 @@
|
||||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Data {
|
||||
Shared {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
#[derive(Default, Clone)]
|
||||
pub struct SharedData {
|
||||
pub bytes: Arc<Vec<u8>>,
|
||||
pub offset: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl Data {
|
||||
pub fn range(&self, off: usize, l: usize) -> Data {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
assert!(off + l <= *len);
|
||||
Data::Shared {
|
||||
bytes: bytes.clone(),
|
||||
offset: offset + off,
|
||||
len: l,
|
||||
}
|
||||
},
|
||||
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
|
||||
impl SharedData {
|
||||
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
|
||||
let len = vec.len();
|
||||
let bytes = Arc::new(vec);
|
||||
SharedData::new(bytes, 0, len)
|
||||
}
|
||||
|
||||
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
|
||||
SharedData { bytes, offset, len }
|
||||
}
|
||||
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
assert!(offset + len <= self.len);
|
||||
SharedData {
|
||||
bytes: self.bytes.clone(),
|
||||
offset: self.offset + offset,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Data {
|
||||
fn default() -> Data {
|
||||
Data::Shared {
|
||||
bytes: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@ -53,13 +45,14 @@ impl Deref for Data {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for Data {
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
&bytes[*offset..offset + len]
|
||||
},
|
||||
Data::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
|
||||
let ptr = slice.as_ptr() as *const u8;
|
||||
let len = slice.len() * size_of::<T>();
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
||||
|
@ -1,110 +0,0 @@
|
||||
mod ops;
|
||||
pub mod positive;
|
||||
pub mod negative;
|
||||
|
||||
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::negative::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Blob {
|
||||
Positive(PositiveBlob),
|
||||
Negative(NegativeBlob),
|
||||
}
|
||||
|
||||
impl Blob {
|
||||
pub fn is_negative(&self) -> bool {
|
||||
self.sign() == Sign::Negative
|
||||
}
|
||||
|
||||
pub fn is_positive(&self) -> bool {
|
||||
self.sign() == Sign::Positive
|
||||
}
|
||||
|
||||
pub fn sign(&self) -> Sign {
|
||||
match self {
|
||||
Blob::Positive(_) => Sign::Positive,
|
||||
Blob::Negative(_) => Sign::Negative,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Blob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
match self {
|
||||
Blob::Positive(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Positive)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
Blob::Negative(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Negative)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Blob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
|
||||
struct TupleVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for TupleVisitor {
|
||||
type Value = Blob;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a Blob struct")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
|
||||
let sign = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(0, &self)),
|
||||
};
|
||||
match sign {
|
||||
Sign::Positive => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Positive(blob))
|
||||
},
|
||||
Sign::Negative => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Negative(blob))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_tuple(2, TupleVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum Sign {
|
||||
Positive,
|
||||
Negative,
|
||||
}
|
||||
|
||||
impl Sign {
|
||||
pub fn invert(self) -> Sign {
|
||||
match self {
|
||||
Sign::Positive => Sign::Negative,
|
||||
Sign::Negative => Sign::Positive,
|
||||
}
|
||||
}
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::fmt;
|
||||
|
||||
use sdset::Set;
|
||||
use serde::de::{self, Deserialize, Deserializer};
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct NegativeBlob {
|
||||
doc_ids: DocIds,
|
||||
}
|
||||
|
||||
impl NegativeBlob {
|
||||
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
{
|
||||
let doc_ids = DocIds::from_path(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let doc_ids = DocIds::from_bytes(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_raw(doc_ids: DocIds) -> Self {
|
||||
NegativeBlob { doc_ids }
|
||||
}
|
||||
|
||||
pub fn as_ids(&self) -> &DocIds {
|
||||
&self.doc_ids
|
||||
}
|
||||
|
||||
pub fn into_doc_ids(self) -> DocIds {
|
||||
self.doc_ids
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for NegativeBlob {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
self.as_ids().doc_ids()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for NegativeBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "NegativeBlob(")?;
|
||||
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for NegativeBlob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.doc_ids.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for NegativeBlob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
|
||||
let bytes = Vec::deserialize(deserializer)?;
|
||||
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
|
||||
}
|
||||
}
|
@ -1,5 +0,0 @@
|
||||
mod blob;
|
||||
mod ops;
|
||||
|
||||
pub use self::blob::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
@ -1,73 +0,0 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::NegativeBlob;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct OpBuilder<'a> {
|
||||
inner: SdOpBuilder<'a, DocumentId>,
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple negative blobs.
|
||||
impl<'a> OpBuilder<'a> {
|
||||
pub fn new() -> Self {
|
||||
Self { inner: SdOpBuilder::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> Self {
|
||||
Self { inner: SdOpBuilder::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'a NegativeBlob) {
|
||||
let set = Set::new_unchecked(blob.as_ref());
|
||||
self.inner.push(set);
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'a> {
|
||||
Union::new(self.inner.union())
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'a> {
|
||||
Intersection::new(self.inner.intersection())
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'a> {
|
||||
Difference::new(self.inner.difference())
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
|
||||
SymmetricDifference::new(self.inner.symmetric_difference())
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! logical_operation {
|
||||
(struct $name:ident, $operation:ident) => {
|
||||
|
||||
pub struct $name<'a> {
|
||||
op: sdset::multi::$name<'a, DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> $name<'a> {
|
||||
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
|
||||
$name { op }
|
||||
}
|
||||
|
||||
pub fn into_negative_blob(self) -> NegativeBlob {
|
||||
let document_ids = sdset::SetOperation::into_set_buf(self.op);
|
||||
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
|
||||
NegativeBlob::from_raw(doc_ids)
|
||||
}
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
@ -1,109 +0,0 @@
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use group_by::GroupBy;
|
||||
|
||||
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
|
||||
use crate::database::blob::{positive, negative};
|
||||
|
||||
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
|
||||
a.sign() == b.sign()
|
||||
}
|
||||
|
||||
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
|
||||
match blob {
|
||||
Blob::Positive(blob) => blob,
|
||||
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
|
||||
}
|
||||
}
|
||||
|
||||
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
|
||||
match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OpBuilder {
|
||||
blobs: Vec<Blob>,
|
||||
}
|
||||
|
||||
impl OpBuilder {
|
||||
pub fn new() -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: Blob) {
|
||||
if self.blobs.is_empty() && blob.is_negative() { return }
|
||||
self.blobs.push(blob);
|
||||
}
|
||||
|
||||
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
|
||||
let groups = GroupBy::new(&self.blobs, blob_same_sign);
|
||||
let mut aggregated = Vec::new();
|
||||
|
||||
for blobs in groups {
|
||||
match blobs[0].sign() {
|
||||
Sign::Positive => {
|
||||
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_positive(blob));
|
||||
}
|
||||
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
// FIXME empty doc_indexes must be handled by OpBuilder
|
||||
if !doc_indexes.is_empty() {
|
||||
builder.insert(input, doc_indexes).unwrap();
|
||||
}
|
||||
}
|
||||
let (map, doc_indexes) = builder.into_inner().unwrap();
|
||||
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
|
||||
aggregated.push(Blob::Positive(blob));
|
||||
},
|
||||
Sign::Negative => {
|
||||
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_negative(blob));
|
||||
}
|
||||
let blob = op_builder.union().into_negative_blob();
|
||||
aggregated.push(Blob::Negative(blob));
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
|
||||
let negative = NegativeBlob::default();
|
||||
let (positive, negative) = match slice {
|
||||
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
|
||||
[a] => (unwrap_positive(a), &negative),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
|
||||
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
|
||||
|
||||
buffer.clear();
|
||||
op.extend_vec(&mut buffer);
|
||||
if !buffer.is_empty() {
|
||||
builder.insert(input, Set::new_unchecked(&buffer))?;
|
||||
}
|
||||
}
|
||||
|
||||
let (map, doc_indexes) = builder.into_inner()?;
|
||||
PositiveBlob::from_bytes(map, doc_indexes)
|
||||
})
|
||||
}
|
||||
}
|
@ -1,254 +0,0 @@
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct PositiveBlob {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl PositiveBlob {
|
||||
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let map = Map::from_path(map)?;
|
||||
let indexes = DocIndexes::from_path(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
|
||||
PositiveBlob { map, indexes }
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
|
||||
self.map.get(key).map(|index| &self.indexes[index as usize])
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn explode(self) -> (Map, DocIndexes) {
|
||||
(self.map, self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PositiveBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "PositiveBlob([")?;
|
||||
let mut stream = self.into_stream();
|
||||
let mut first = true;
|
||||
while let Some((k, v)) = stream.next() {
|
||||
if !first {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
|
||||
}
|
||||
write!(f, "])")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
/// The type of the stream to be constructed.
|
||||
type Into = PositiveBlobStream<'m>;
|
||||
|
||||
/// Construct a stream from `Self`.
|
||||
fn into_stream(self) -> Self::Into {
|
||||
PositiveBlobStream {
|
||||
map_stream: self.map.into_stream(),
|
||||
doc_indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobStream<'m> {
    map_stream: map::Stream<'m>,
    doc_indexes: &'m DocIndexes,
}

impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
    type Item = (&'a [u8], &'a [DocIndex]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.map_stream.next() {
            Some((input, index)) => {
                let doc_indexes = &self.doc_indexes[index as usize];
                Some((input, doc_indexes))
            },
            None => None,
        }
    }
}

impl Serialize for PositiveBlob {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let mut tuple = serializer.serialize_tuple(2)?;
        tuple.serialize_element(&self.map.as_fst().to_vec())?;
        tuple.serialize_element(&self.indexes.to_vec())?;
        tuple.end()
    }
}

impl<'de> Deserialize<'de> for PositiveBlob {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
        struct TupleVisitor;

        impl<'de> Visitor<'de> for TupleVisitor {
            type Value = PositiveBlob;

            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                formatter.write_str("a PositiveBlob struct")
            }

            #[inline]
            fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
                let map = match seq.next_element()? {
                    Some(bytes) => match Map::from_bytes(bytes) {
                        Ok(value) => value,
                        Err(err) => return Err(de::Error::custom(err)),
                    },
                    None => return Err(de::Error::invalid_length(0, &self)),
                };

                let indexes = match seq.next_element()? {
                    Some(bytes) => match DocIndexes::from_bytes(bytes) {
                        Ok(value) => value,
                        Err(err) => return Err(de::Error::custom(err)),
                    },
                    None => return Err(de::Error::invalid_length(1, &self)),
                };

                Ok(PositiveBlob { map, indexes })
            }
        }

        deserializer.deserialize_tuple(2, TupleVisitor)
    }
}

pub struct PositiveBlobBuilder<W, X> {
    map: fst::MapBuilder<W>,
    indexes: DocIndexesBuilder<X>,
    value: u64,
}

impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
    pub fn memory() -> Self {
        PositiveBlobBuilder {
            map: fst::MapBuilder::memory(),
            indexes: DocIndexesBuilder::memory(),
            value: 0,
        }
    }
}

impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
    pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
        Ok(PositiveBlobBuilder {
            map: fst::MapBuilder::new(map)?,
            indexes: DocIndexesBuilder::new(indexes),
            value: 0,
        })
    }

    /// If a key is inserted that is less than or equal to any previous key added,
    /// then an error is returned. Similarly, if there was a problem writing
    /// to the underlying writer, an error is returned.
    // FIXME what if one write doesn't work but the other do ?
    pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
    where K: AsRef<[u8]>,
    {
        self.map.insert(key, self.value)?;
        self.indexes.insert(doc_indexes)?;
        self.value += 1;
        Ok(())
    }

    pub fn finish(self) -> Result<(), Box<Error>> {
        self.into_inner().map(drop)
    }

    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
        let map = self.map.into_inner()?;
        let indexes = self.indexes.into_inner()?;
        Ok((map, indexes))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    #[test]
    fn serialize_deserialize() -> Result<(), Box<Error>> {
        let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
        let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
        let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };

        let mut builder = PositiveBlobBuilder::memory();

        builder.insert("aaa", Set::new(&[a])?)?;
        builder.insert("aab", Set::new(&[a, b, c])?)?;
        builder.insert("aac", Set::new(&[a, c])?)?;

        let (map_bytes, indexes_bytes) = builder.into_inner()?;
        let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;

        assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
        assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
        assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
        assert_eq!(positive_blob.get("aad"), None);

        Ok(())
    }

    #[test]
    fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
        let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
        let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
        let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };

        let mut builder = PositiveBlobBuilder::memory();

        builder.insert("aaa", Set::new(&[a])?)?;
        builder.insert("aab", Set::new(&[a, b, c])?)?;
        builder.insert("aac", Set::new(&[a, c])?)?;

        let (map_bytes, indexes_bytes) = builder.into_inner()?;
        let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;

        let bytes = bincode::serialize(&positive_blob)?;
        let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;

        assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
        assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
        assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
        assert_eq!(positive_blob.get("aad"), None);

        Ok(())
    }
}
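The ordering requirement documented on `insert` comes straight from the underlying `fst::MapBuilder`; a minimal sketch in the style of the tests above (the test name is only illustrative, it is not part of the codebase):

#[test]
fn rejects_unordered_keys() -> Result<(), Box<Error>> {
    let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };

    let mut builder = PositiveBlobBuilder::memory();
    builder.insert("bbb", Set::new(&[a])?)?;

    // "aaa" sorts before "bbb", so the underlying fst::MapBuilder rejects it.
    assert!(builder.insert("aaa", Set::new(&[a])?).is_err());
    Ok(())
}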
@@ -1,5 +0,0 @@
mod blob;
mod ops;

pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
pub use self::ops::OpBuilder;
@@ -1,128 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};

use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;

pub struct OpBuilder<'m> {
    // the operation on the maps is always an union.
    map_op: fst::map::OpBuilder<'m>,
    indexes: Vec<&'m DocIndexes>,
}

/// Do a set operation on multiple positive blobs.
impl<'m> OpBuilder<'m> {
    pub fn new() -> Self {
        Self {
            map_op: fst::map::OpBuilder::new(),
            indexes: Vec::new(),
        }
    }

    pub fn with_capacity(cap: usize) -> Self {
        Self {
            map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
            indexes: Vec::with_capacity(cap),
        }
    }

    pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
        self.push(blob);
        self
    }

    pub fn push(&mut self, blob: &'m PositiveBlob) {
        self.map_op.push(blob.as_map());
        self.indexes.push(blob.as_indexes());
    }

    pub fn union(self) -> Union<'m> {
        Union::new(self.map_op.union(), self.indexes)
    }

    pub fn intersection(self) -> Intersection<'m> {
        Intersection::new(self.map_op.union(), self.indexes)
    }

    pub fn difference(self) -> Difference<'m> {
        Difference::new(self.map_op.union(), self.indexes)
    }

    pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
        SymmetricDifference::new(self.map_op.union(), self.indexes)
    }
}

macro_rules! logical_operation {
    (struct $name:ident, $operation:ident) => {

        pub struct $name<'m> {
            stream: fst::map::Union<'m>,
            indexes: Vec<&'m DocIndexes>,
            outs: Vec<DocIndex>,
        }

        impl<'m> $name<'m> {
            fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
                $name {
                    stream: stream,
                    indexes: indexes,
                    outs: Vec::new(),
                }
            }
        }

        impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
            type Item = (&'a [u8], &'a Set<DocIndex>);

            fn next(&'a mut self) -> Option<Self::Item> {
                // loop {
                //     let (input, ivalues) = match self.stream.next() {
                //         Some(value) => value,
                //         None => return None,
                //     };

                //     self.outs.clear();

                //     let mut builder = SdOpBuilder::with_capacity(ivalues.len());
                //     for ivalue in ivalues {
                //         let indexes = self.indexes[ivalue.index];
                //         let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
                //         let set = Set::new_unchecked(indexes);
                //         builder.push(set);
                //     }

                //     builder.$operation().extend_vec(&mut self.outs);

                //     if self.outs.is_empty() { continue }
                //     return Some((input, &self.outs))
                // }

                // FIXME make the above code compile
                match self.stream.next() {
                    Some((input, ivalues)) => {
                        self.outs.clear();

                        let mut builder = SdOpBuilder::with_capacity(ivalues.len());
                        for ivalue in ivalues {
                            let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
                            let set = Set::new_unchecked(doc_indexes);
                            builder.push(set);
                        }

                        builder.$operation().extend_vec(&mut self.outs);

                        if self.outs.is_empty() { return None }
                        return Some((input, Set::new_unchecked(&self.outs)))
                    },
                    None => None
                }
            }
        }
}}

logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
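For reference, a minimal sketch of the sdset pattern the macro above relies on, shown on plain integers rather than `DocIndex` values (the function name is illustrative, not from the codebase):

use std::error::Error;
use sdset::multi::OpBuilder;
use sdset::{Set, SetOperation};

fn union_example() -> Result<(), Box<Error>> {
    let a = Set::new(&[1u32, 2, 4])?;
    let b = Set::new(&[2u32, 3])?;

    // Push every per-word set, then apply a single operation over all of them.
    let mut builder = OpBuilder::with_capacity(2);
    builder.push(a);
    builder.push(b);

    let mut out = Vec::new();
    builder.union().extend_vec(&mut out);
    assert_eq!(out, vec![1, 2, 3, 4]);
    Ok(())
}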
@@ -2,13 +2,13 @@ use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;

use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};

use crate::database::schema::SchemaAttr;
use crate::DocumentId;

const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();

#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
@@ -19,7 +19,7 @@ impl DocumentKey {

        let mut wtr = Cursor::new(&mut buffer[..]);
        wtr.write_all(b"doc-").unwrap();
        wtr.write_u64::<NativeEndian>(id).unwrap();
        wtr.write_u64::<BigEndian>(id.0).unwrap();

        DocumentKey(buffer)
    }
@@ -38,12 +38,17 @@ impl DocumentKey {
        DocumentKeyAttr::new(self.document_id(), attr)
    }

    pub fn with_attribute_min(&self) -> DocumentKeyAttr {
        DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
    }

    pub fn with_attribute_max(&self) -> DocumentKeyAttr {
        DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
    }

    pub fn document_id(&self) -> DocumentId {
        (&self.0[4..]).read_u64::<NativeEndian>().unwrap()
        let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
        DocumentId(id)
    }
}

@@ -72,11 +77,19 @@ impl DocumentKeyAttr {
        let mut wtr = Cursor::new(&mut buffer[..]);
        wtr.write_all(&raw_key).unwrap();
        wtr.write_all(b"-").unwrap();
        wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
        wtr.write_u16::<BigEndian>(attr.0).unwrap();

        DocumentKeyAttr(buffer)
    }

    pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
        DocumentKeyAttr::new(id, SchemaAttr::min())
    }

    pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
        DocumentKeyAttr::new(id, SchemaAttr::max())
    }

    pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
        assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
        assert_eq!(&bytes[..4], b"doc-");
@@ -88,12 +101,13 @@ impl DocumentKeyAttr {
    }

    pub fn document_id(&self) -> DocumentId {
        (&self.0[4..]).read_u64::<NativeEndian>().unwrap()
        let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
        DocumentId(id)
    }

    pub fn attribute(&self) -> SchemaAttr {
        let offset = 4 + size_of::<u64>() + 1;
        let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
        let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
        SchemaAttr::new(value)
    }

@@ -112,7 +126,24 @@ impl fmt::Debug for DocumentKeyAttr {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("DocumentKeyAttr")
            .field("document_id", &self.document_id())
            .field("attribute", &self.attribute().as_u32())
            .field("attribute", &self.attribute().0)
            .finish()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn keep_as_ref_order() {
        for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
            let id = DocumentId(0);
            let a = DocumentKeyAttr::new(id, SchemaAttr(a));
            let b = DocumentKeyAttr::new(id, SchemaAttr(b));

            assert!(a < b);
            assert!(a.as_ref() < b.as_ref());
        }
    }
}
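The switch from `NativeEndian` to `BigEndian` in the key layout is what makes the `keep_as_ref_order` test hold: big-endian bytes compare in the same order as the numbers they encode. A small illustration, not taken from the codebase:

use byteorder::{BigEndian, WriteBytesExt};

fn big_endian_keeps_order() {
    let (mut low, mut high) = (Vec::new(), Vec::new());
    low.write_u16::<BigEndian>(1).unwrap();    // [0x00, 0x01]
    high.write_u16::<BigEndian>(256).unwrap(); // [0x01, 0x00]

    // Byte-wise comparison agrees with numeric order, which is what a
    // byte-ordered store like RocksDB relies on when iterating "doc-" keys.
    assert!(low < high);
}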
82 src/database/index/mod.rs Normal file
@@ -0,0 +1,82 @@
mod negative;
mod positive;

pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};

use std::error::Error;
use std::io::Cursor;
use std::sync::Arc;

use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::Map;

use crate::data::{SharedData, DocIndexes};

#[derive(Default)]
pub struct Index {
    pub(crate) negative: Negative,
    pub(crate) positive: Positive,
}

impl Index {
    pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
        let len = bytes.len();
        Index::from_shared_bytes(Arc::new(bytes), 0, len)
    }

    pub fn from_shared_bytes(
        bytes: Arc<Vec<u8>>,
        offset: usize,
        len: usize,
    ) -> Result<Index, Box<Error>>
    {
        let data = SharedData::new(bytes, offset, len);
        let mut cursor = Cursor::new(data);

        let negative = Negative::from_cursor(&mut cursor)?;
        let positive = Positive::from_cursor(&mut cursor)?;
        Ok(Index { negative, positive })
    }

    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        self.negative.write_to_bytes(bytes);
        self.positive.write_to_bytes(bytes);
    }

    pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
        if other.negative.is_empty() {
            let negative = Negative::default();
            let positive = self.positive.union(&other.positive)?;
            return Ok(Index { negative, positive })
        }

        let mut buffer = Vec::new();
        let mut builder = PositiveBuilder::memory();
        let mut stream = self.positive.into_stream();
        while let Some((key, indexes)) = stream.next() {
            let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);

            buffer.clear();
            op.extend_vec(&mut buffer);

            if !buffer.is_empty() {
                let indexes = Set::new_unchecked(&buffer);
                builder.insert(key, indexes)?;
            }
        }

        let positive = {
            let (map, indexes) = builder.into_inner()?;
            let map = Map::from_bytes(map)?;
            let indexes = DocIndexes::from_bytes(indexes)?;
            Positive::new(map, indexes)
        };

        let negative = Negative::default();
        let positive = positive.union(&other.positive)?;
        Ok(Index { negative, positive })
    }
}
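Since `write_to_bytes` and `from_bytes` are symmetric, an `Index` round-trips through a single contiguous buffer. A minimal sketch using only the API shown above (the function name is illustrative):

use std::error::Error;

fn index_roundtrip() -> Result<(), Box<Error>> {
    let index = Index::default();

    // Negative part first (length-prefixed deleted doc ids),
    // then the positive part (length-prefixed fst map + doc indexes).
    let mut bytes = Vec::new();
    index.write_to_bytes(&mut bytes);

    let index = Index::from_bytes(bytes)?;
    assert!(index.negative.is_empty());
    Ok(())
}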
43 src/database/index/negative.rs Normal file
@@ -0,0 +1,43 @@
use std::error::Error;
use std::io::Cursor;
use std::ops::Deref;

use sdset::Set;
use byteorder::{LittleEndian, WriteBytesExt};

use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;

#[derive(Default)]
pub struct Negative(DocIds);

impl Negative {
    pub fn new(doc_ids: DocIds) -> Negative {
        Negative(doc_ids)
    }

    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
        let doc_ids = DocIds::from_cursor(cursor)?;
        Ok(Negative(doc_ids))
    }

    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        let slice = self.0.as_bytes();
        let len = slice.len() as u64;
        let _ = bytes.write_u64::<LittleEndian>(len);
        bytes.extend_from_slice(slice);
    }

    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}

impl Deref for Negative {
    type Target = Set<DocumentId>;

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
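The same length-prefix framing appears again in `Positive::write_to_bytes` below: each part is written as an 8-byte little-endian length followed by the raw payload. An illustrative helper, not part of the codebase:

use byteorder::{LittleEndian, WriteBytesExt};

fn write_framed(out: &mut Vec<u8>, payload: &[u8]) {
    // 8-byte little-endian length, then the payload itself.
    let _ = out.write_u64::<LittleEndian>(payload.len() as u64);
    out.extend_from_slice(payload);
}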
166 src/database/index/positive.rs Normal file
@@ -0,0 +1,166 @@
use std::io::{Write, BufRead, Cursor};
use std::error::Error;

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::{Set, SetOperation};
use sdset::duo::Union;
use fst::raw::Fst;

use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;

#[derive(Default)]
pub struct Positive {
    map: Map,
    indexes: DocIndexes,
}

impl Positive {
    pub fn new(map: Map, indexes: DocIndexes) -> Positive {
        Positive { map, indexes }
    }

    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
        let len = cursor.read_u64::<LittleEndian>()? as usize;
        let offset = cursor.position() as usize;
        let data = cursor.get_ref().range(offset, len);

        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
        let map = Map::from(fst);
        cursor.consume(len);

        let indexes = DocIndexes::from_cursor(cursor)?;

        Ok(Positive { map, indexes })
    }

    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        let slice = self.map.as_fst().as_bytes();
        let len = slice.len() as u64;
        let _ = bytes.write_u64::<LittleEndian>(len);
        bytes.extend_from_slice(slice);

        self.indexes.write_to_bytes(bytes);
    }

    pub fn map(&self) -> &Map {
        &self.map
    }

    pub fn indexes(&self) -> &DocIndexes {
        &self.indexes
    }

    pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
        let mut builder = PositiveBuilder::memory();
        let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();

        let mut buffer = Vec::new();
        while let Some((key, ivalues)) = stream.next() {
            buffer.clear();
            match ivalues {
                [a, b] => {
                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
                    let a = Set::new_unchecked(indexes);

                    let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
                    let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
                    let b = Set::new_unchecked(indexes);

                    let op = Union::new(a, b);
                    op.extend_vec(&mut buffer);
                },
                [a] => {
                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
                    buffer.extend_from_slice(indexes)
                },
                _ => continue,
            }

            if !buffer.is_empty() {
                let indexes = Set::new_unchecked(&buffer);
                builder.insert(key, indexes)?;
            }
        }

        let (map, indexes) = builder.into_inner()?;
        let map = Map::from_bytes(map)?;
        let indexes = DocIndexes::from_bytes(indexes)?;
        Ok(Positive { map, indexes })
    }
}

impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
    type Item = (&'a [u8], &'a Set<DocIndex>);
    /// The type of the stream to be constructed.
    type Into = Stream<'m>;

    /// Construct a stream from `Self`.
    fn into_stream(self) -> Self::Into {
        Stream {
            map_stream: self.map.into_stream(),
            indexes: &self.indexes,
        }
    }
}

pub struct Stream<'m> {
    map_stream: map::Stream<'m>,
    indexes: &'m DocIndexes,
}

impl<'m, 'a> Streamer<'a> for Stream<'m> {
    type Item = (&'a [u8], &'a Set<DocIndex>);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.map_stream.next() {
            Some((input, index)) => {
                let indexes = &self.indexes[index as usize];
                let indexes = Set::new_unchecked(indexes);
                Some((input, indexes))
            },
            None => None,
        }
    }
}

pub struct PositiveBuilder<W, X> {
    map: fst::MapBuilder<W>,
    indexes: DocIndexesBuilder<X>,
    value: u64,
}

impl PositiveBuilder<Vec<u8>, Vec<u8>> {
    pub fn memory() -> Self {
        PositiveBuilder {
            map: fst::MapBuilder::memory(),
            indexes: DocIndexesBuilder::memory(),
            value: 0,
        }
    }
}

impl<W: Write, X: Write> PositiveBuilder<W, X> {
    /// If a key is inserted that is less than or equal to any previous key added,
    /// then an error is returned. Similarly, if there was a problem writing
    /// to the underlying writer, an error is returned.
    // FIXME what if one write doesn't work but the other do ?
    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
    where K: AsRef<[u8]>,
    {
        self.map.insert(key, self.value)?;
        self.indexes.insert(indexes);
        self.value += 1;
        Ok(())
    }

    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
        let map = self.map.into_inner()?;
        let indexes = self.indexes.into_inner()?;
        Ok((map, indexes))
    }
}
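For context on `Positive::union` above: the `fst` union stream yields, for every key, one `IndexedValue` per input map that contains it, which is why the `[a]` and `[a, b]` arms exist. A minimal sketch of that pattern (the maps, keys, and function name are hypothetical):

use fst::{map, Map, Streamer};

fn merged_keys(left: &Map, right: &Map) -> Vec<Vec<u8>> {
    let mut stream = map::OpBuilder::new().add(left).add(right).union();
    let mut keys = Vec::new();

    while let Some((key, ivalues)) = stream.next() {
        // ivalues has one entry when only one map contains the key, two when
        // both do; `index` says which map it came from, `value` what it maps to.
        debug_assert!(ivalues.len() <= 2);
        keys.push(key.to_vec());
    }
    keys
}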
@ -1,66 +1,128 @@
|
||||
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
|
||||
use std::sync::Arc;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::ops::Deref;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
||||
use crossbeam::atomic::ArcCell;
|
||||
use log::{info, error, warn};
|
||||
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
use rocksdb::{DB, DBVector, MergeOperands};
|
||||
use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
|
||||
use rocksdb::{DB, MergeOperands};
|
||||
use lockfree::map::Map;
|
||||
|
||||
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use self::database_view::{DatabaseView, DocumentIter};
|
||||
use self::blob::positive::PositiveBlob;
|
||||
use self::update::Update;
|
||||
use self::schema::Schema;
|
||||
use self::blob::Blob;
|
||||
|
||||
pub mod blob;
|
||||
pub mod schema;
|
||||
pub mod update;
|
||||
mod document_key;
|
||||
mod database_view;
|
||||
mod deserializer;
|
||||
pub use self::view::{DatabaseView, DocumentIter};
|
||||
pub use self::update::Update;
|
||||
pub use self::serde::SerializerError;
|
||||
pub use self::schema::Schema;
|
||||
pub use self::index::Index;
|
||||
|
||||
const DATA_INDEX: &[u8] = b"data-index";
|
||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||
|
||||
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
pub mod schema;
|
||||
pub(crate) mod index;
|
||||
mod document_key;
|
||||
mod serde;
|
||||
mod update;
|
||||
mod view;
|
||||
|
||||
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_SCHEMA)? {
|
||||
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
||||
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
|
||||
None => Err(String::from("BUG: no schema found in the database").into()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
|
||||
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_INDEX)? {
|
||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
let (elapsed, vector) = elapsed::measure_time(|| snapshot.get(DATA_INDEX));
|
||||
info!("loading index from kv-store took {}", elapsed);
|
||||
|
||||
let index = match vector? {
|
||||
Some(vector) => {
|
||||
let bytes = vector.as_ref().to_vec();
|
||||
info!("index size if {} MiB", bytes.len() / 1024 / 1024);
|
||||
|
||||
let (elapsed, index) = elapsed::measure_time(|| Index::from_bytes(bytes));
|
||||
info!("loading index from bytes took {}", elapsed);
|
||||
index?
|
||||
|
||||
},
|
||||
None => Index::default(),
|
||||
};
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
|
||||
|
||||
let mut index: Option<Index> = None;
|
||||
for bytes in existing.into_iter().chain(operands) {
|
||||
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
|
||||
let merged = match index {
|
||||
Some(ref index) => index.merge(&operand).unwrap(),
|
||||
None => operand,
|
||||
};
|
||||
|
||||
index.replace(merged);
|
||||
}
|
||||
|
||||
let index = index.unwrap_or_default();
|
||||
let mut bytes = Vec::new();
|
||||
index.write_to_bytes(&mut bytes);
|
||||
bytes
|
||||
}
|
||||
|
||||
pub struct IndexUpdate {
|
||||
index: String,
|
||||
update: Update,
|
||||
}
|
||||
|
||||
impl Deref for IndexUpdate {
|
||||
type Target = Update;
|
||||
|
||||
fn deref(&self) -> &Update {
|
||||
&self.update
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Database {
|
||||
// DB is under a Mutex to sync update ingestions and separate DB update locking
|
||||
// and DatabaseView acquiring locking in other words:
|
||||
// "Block readers the minimum possible amount of time"
|
||||
db: Mutex<Arc<DB>>,
|
||||
|
||||
// This view is updated each time the DB ingests an update
|
||||
view: RwLock<DatabaseView<Arc<DB>>>,
|
||||
impl DerefMut for IndexUpdate {
|
||||
fn deref_mut(&mut self) -> &mut Update {
|
||||
&mut self.update
|
||||
}
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> {
|
||||
struct DatabaseIndex {
|
||||
db: Arc<DB>,
|
||||
|
||||
// This view is updated each time the DB ingests an update
|
||||
view: ArcCell<DatabaseView<Arc<DB>>>,
|
||||
|
||||
// This path is the path to the mdb folder stored on disk
|
||||
path: PathBuf,
|
||||
|
||||
// must_die false by default, must be set as true when the Index is dropped.
|
||||
// It's used to erase the folder saved on disk when the user request to delete an index
|
||||
must_die: AtomicBool,
|
||||
}
|
||||
|
||||
impl DatabaseIndex {
|
||||
fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
|
||||
let path = path.as_ref();
|
||||
if path.exists() {
|
||||
return Err(format!("File already exists at path: {}, cannot create database.",
|
||||
path.display()).into())
|
||||
}
|
||||
|
||||
let path = path.to_string_lossy();
|
||||
let path_lossy = path.to_string_lossy();
|
||||
let mut opts = DBOptions::new();
|
||||
opts.create_if_missing(true);
|
||||
// opts.error_if_exists(true); // FIXME pull request that
|
||||
@ -68,21 +130,27 @@ impl Database {
|
||||
let mut cf_opts = ColumnFamilyOptions::new();
|
||||
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
|
||||
|
||||
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
|
||||
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
|
||||
|
||||
let mut schema_bytes = Vec::new();
|
||||
schema.write_to(&mut schema_bytes)?;
|
||||
schema.write_to_bin(&mut schema_bytes)?;
|
||||
db.put(DATA_SCHEMA, &schema_bytes)?;
|
||||
|
||||
let db = Arc::new(db);
|
||||
let snapshot = Snapshot::new(db.clone());
|
||||
let view = RwLock::new(DatabaseView::new(snapshot)?);
|
||||
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
|
||||
|
||||
Ok(Database { db: Mutex::new(db), view })
|
||||
|
||||
Ok(DatabaseIndex {
|
||||
db: db,
|
||||
view: view,
|
||||
path: path.to_path_buf(),
|
||||
must_die: AtomicBool::new(false)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
|
||||
let path = path.as_ref().to_string_lossy();
|
||||
fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
|
||||
let path_lossy = path.as_ref().to_string_lossy();
|
||||
|
||||
let mut opts = DBOptions::new();
|
||||
opts.create_if_missing(false);
|
||||
@ -90,170 +158,639 @@ impl Database {
|
||||
let mut cf_opts = ColumnFamilyOptions::new();
|
||||
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
|
||||
|
||||
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
|
||||
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
|
||||
|
||||
// FIXME create a generic function to do that !
|
||||
let _schema = match db.get(DATA_SCHEMA)? {
|
||||
Some(value) => Schema::read_from(&*value)?,
|
||||
Some(value) => Schema::read_from_bin(&*value)?,
|
||||
None => return Err(String::from("Database does not contain a schema").into()),
|
||||
};
|
||||
|
||||
let db = Arc::new(db);
|
||||
let snapshot = Snapshot::new(db.clone());
|
||||
let view = RwLock::new(DatabaseView::new(snapshot)?);
|
||||
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
|
||||
|
||||
Ok(Database { db: Mutex::new(db), view })
|
||||
Ok(DatabaseIndex {
|
||||
db: db,
|
||||
view: view,
|
||||
path: path.as_ref().to_path_buf(),
|
||||
must_die: AtomicBool::new(false)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> {
|
||||
let snapshot = {
|
||||
// We must have a mutex here to ensure that update ingestions and compactions
|
||||
// are done atomatically and in the right order.
|
||||
// This way update ingestions will block other update ingestions without blocking view
|
||||
// creations while doing the "data-index" compaction
|
||||
let db = match self.db.lock() {
|
||||
Ok(db) => db,
|
||||
Err(e) => return Err(e.to_string().into()),
|
||||
};
|
||||
fn must_die(&self) {
|
||||
self.must_die.store(true, Ordering::Relaxed)
|
||||
}
|
||||
|
||||
let move_update = update.can_be_moved();
|
||||
let path = update.into_path_buf();
|
||||
let path = path.to_string_lossy();
|
||||
|
||||
let mut options = IngestExternalFileOptions::new();
|
||||
options.move_files(move_update);
|
||||
|
||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
||||
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
|
||||
|
||||
// Compacting to trigger the merge operator only one time
|
||||
// while ingesting the update and not each time searching
|
||||
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
|
||||
|
||||
Snapshot::new(db.clone())
|
||||
fn start_update(&self) -> Result<Update, Box<Error>> {
|
||||
let schema = match self.db.get(DATA_SCHEMA)? {
|
||||
Some(value) => Schema::read_from_bin(&*value)?,
|
||||
None => panic!("Database does not contain a schema"),
|
||||
};
|
||||
|
||||
// Here we will block the view creation for the minimum amount of time:
|
||||
// updating the DatabaseView itself with the new database snapshot
|
||||
let view = DatabaseView::new(snapshot)?;
|
||||
match self.view.write() {
|
||||
Ok(mut lock) => *lock = view,
|
||||
Err(e) => return Err(e.to_string().into()),
|
||||
Ok(Update::new(schema))
|
||||
}
|
||||
|
||||
fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let batch = update.build()?;
|
||||
self.db.write(batch)?;
|
||||
|
||||
let snapshot = Snapshot::new(self.db.clone());
|
||||
let view = Arc::new(DatabaseView::new(snapshot)?);
|
||||
self.view.set(view.clone());
|
||||
|
||||
Ok(view)
|
||||
}
|
||||
|
||||
fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
|
||||
self.view.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DatabaseIndex {
|
||||
fn drop(&mut self) {
|
||||
if self.must_die.load(Ordering::Relaxed) {
|
||||
if let Err(err) = fs::remove_dir_all(&self.path) {
|
||||
error!("Impossible to remove mdb when Database id dropped; {}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Database {
|
||||
indexes: Map<String, Arc<DatabaseIndex>>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
|
||||
Ok(Database {
|
||||
indexes: Map::new(),
|
||||
path: path.as_ref().to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
|
||||
let entries = fs::read_dir(&path)?;
|
||||
|
||||
let indexes = Map::new();
|
||||
for entry in entries {
|
||||
let path = match entry {
|
||||
Ok(p) => p.path(),
|
||||
Err(err) => {
|
||||
warn!("Impossible to retrieve the path from an entry; {}", err);
|
||||
continue
|
||||
}
|
||||
};
|
||||
|
||||
let name = match path.file_stem().and_then(OsStr::to_str) {
|
||||
Some(name) => name.to_owned(),
|
||||
None => continue
|
||||
};
|
||||
|
||||
let db = match DatabaseIndex::open(path.clone()) {
|
||||
Ok(db) => db,
|
||||
Err(err) => {
|
||||
warn!("Impossible to open the database; {}", err);
|
||||
continue
|
||||
}
|
||||
};
|
||||
|
||||
info!("Load database {}", name);
|
||||
indexes.insert(name, Arc::new(db));
|
||||
}
|
||||
|
||||
Ok(Database {
|
||||
indexes: indexes,
|
||||
path: path.as_ref().to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
|
||||
let index_path = self.path.join(name);
|
||||
|
||||
if index_path.exists() {
|
||||
return Err("Index already exists".into());
|
||||
}
|
||||
|
||||
let index = DatabaseIndex::create(index_path, schema)?;
|
||||
self.indexes.insert(name.to_owned(), Arc::new(index));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
|
||||
self.view().get(key)
|
||||
pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
|
||||
let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
|
||||
index_guard.val().must_die();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&self) -> Result<(), Box<Error>> {
|
||||
match self.db.lock() {
|
||||
Ok(db) => Ok(db.flush(true)?),
|
||||
Err(e) => Err(e.to_string().into()),
|
||||
}
|
||||
pub fn list_indexes(&self) -> Vec<String> {
|
||||
self.indexes.iter().map(|g| g.key().clone()).collect()
|
||||
}
|
||||
|
||||
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> {
|
||||
self.view.read().unwrap()
|
||||
}
|
||||
}
|
||||
pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
let update = index_guard.val().start_update()?;
|
||||
|
||||
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
if key != DATA_INDEX {
|
||||
panic!("The merge operator only supports \"data-index\" merging")
|
||||
Ok(IndexUpdate { index: index.to_owned(), update })
|
||||
}
|
||||
|
||||
let capacity = {
|
||||
let remaining = operands.size_hint().0;
|
||||
let already_exist = usize::from(existing_value.is_some());
|
||||
remaining + already_exist
|
||||
};
|
||||
pub fn commit_update(&self, update: IndexUpdate)-> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
|
||||
|
||||
let mut op = blob::OpBuilder::with_capacity(capacity);
|
||||
if let Some(existing_value) = existing_value {
|
||||
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
|
||||
op.push(Blob::Positive(blob));
|
||||
index_guard.val().commit_update(update.update)
|
||||
}
|
||||
|
||||
for bytes in operands {
|
||||
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
|
||||
op.push(blob);
|
||||
pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
|
||||
Ok(index_guard.val().view())
|
||||
}
|
||||
|
||||
let blob = op.merge().expect("BUG: could not merge blobs");
|
||||
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::collections::HashSet;
|
||||
use std::error::Error;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
use crate::database::update::PositiveUpdateBuilder;
|
||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn ingest_update_file() -> Result<(), Box<Error>> {
|
||||
let dir = tempdir()?;
|
||||
fn ingest_one_easy_update() -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
||||
let meilidb_path = dir.path().join("meilidb.mdb");
|
||||
let meilidb_index_name = "default";
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct SimpleDoc {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
timestamp: u64,
|
||||
}
|
||||
|
||||
let schema = {
|
||||
let mut builder = SchemaBuilder::new();
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("id", STORED);
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
builder.new_attribute("timestamp", STORED);
|
||||
builder.build()
|
||||
};
|
||||
|
||||
let database = Database::create(&rocksdb_path, schema.clone())?;
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let database = Database::create(&meilidb_path)?;
|
||||
|
||||
let update_path = dir.path().join("update.sst");
|
||||
database.create_index(meilidb_index_name, &schema)?;
|
||||
|
||||
let doc0 = SimpleDoc {
|
||||
id: 0,
|
||||
title: String::from("I am a title"),
|
||||
description: String::from("I am a description"),
|
||||
timestamp: 1234567,
|
||||
};
|
||||
let doc1 = SimpleDoc {
|
||||
id: 1,
|
||||
title: String::from("I am the second title"),
|
||||
description: String::from("I am the second description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let mut update = {
|
||||
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
|
||||
builder.update(0, &doc0).unwrap();
|
||||
builder.update(1, &doc1).unwrap();
|
||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||
|
||||
builder.build()?
|
||||
};
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
update.set_move(true);
|
||||
database.ingest_update_file(update)?;
|
||||
let view = database.view();
|
||||
|
||||
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
|
||||
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
|
||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
||||
|
||||
assert_eq!(doc0, de_doc0);
|
||||
assert_eq!(doc1, de_doc1);
|
||||
|
||||
Ok(dir.close()?)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let meilidb_path = dir.path().join("meilidb.mdb");
|
||||
let meilidb_index_name = "default";
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct SimpleDoc {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
timestamp: u64,
|
||||
}
|
||||
|
||||
let schema = {
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("id", STORED);
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
builder.new_attribute("timestamp", STORED);
|
||||
builder.build()
|
||||
};
|
||||
|
||||
let database = Database::create(&meilidb_path)?;
|
||||
|
||||
database.create_index(meilidb_index_name, &schema)?;
|
||||
|
||||
let doc0 = SimpleDoc {
|
||||
id: 0,
|
||||
title: String::from("I am a title"),
|
||||
description: String::from("I am a description"),
|
||||
timestamp: 1234567,
|
||||
};
|
||||
let doc1 = SimpleDoc {
|
||||
id: 1,
|
||||
title: String::from("I am the second title"),
|
||||
description: String::from("I am the second description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
let doc2 = SimpleDoc {
|
||||
id: 2,
|
||||
title: String::from("I am the third title"),
|
||||
description: String::from("I am the third description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
let doc3 = SimpleDoc {
|
||||
id: 3,
|
||||
title: String::from("I am the fourth title"),
|
||||
description: String::from("I am the fourth description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||
database.commit_update(builder)?;
|
||||
|
||||
let mut builder = database.start_update(meilidb_index_name)?;
|
||||
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
||||
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
||||
|
||||
assert_eq!(doc0, de_doc0);
|
||||
assert_eq!(doc1, de_doc1);
|
||||
|
||||
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
|
||||
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
|
||||
|
||||
assert_eq!(doc2, de_doc2);
|
||||
assert_eq!(doc3, de_doc3);
|
||||
|
||||
Ok(dir.close()?)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "nightly", test))]
|
||||
mod bench {
|
||||
extern crate test;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::error::Error;
|
||||
use std::iter::repeat_with;
|
||||
use self::test::Bencher;
|
||||
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand_xorshift::XorShiftRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use serde_derive::Serialize;
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
use crate::database::schema::*;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
|
||||
let mut words = String::new();
|
||||
|
||||
for i in 0..number {
|
||||
let word_len = rng.gen_range(1, 12);
|
||||
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
|
||||
words.extend(iter);
|
||||
|
||||
if i == number - 1 { // last word
|
||||
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
|
||||
words.extend(final_);
|
||||
} else {
|
||||
let middle = [",", ", "].choose(rng).cloned();
|
||||
words.extend(middle);
|
||||
}
|
||||
}
|
||||
|
||||
words
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..300 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
|
||||
drop(database);
|
||||
|
||||
bench.iter(|| {
|
||||
let database = Database::open(db_path.clone()).unwrap();
|
||||
test::black_box(|| database);
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..3000 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
|
||||
drop(database);
|
||||
|
||||
bench.iter(|| {
|
||||
let database = Database::open(db_path.clone()).unwrap();
|
||||
test::black_box(|| database);
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..30_000 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
database.commit_update(builder)?;
|
||||
|
||||
drop(database);
|
||||
|
||||
bench.iter(|| {
|
||||
let database = Database::open(db_path.clone()).unwrap();
|
||||
test::black_box(|| database);
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..300 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
bench.iter(|| {
|
||||
for q in &["a", "b", "c", "d", "e"] {
|
||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
||||
test::black_box(|| documents);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..3000 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
bench.iter(|| {
|
||||
for q in &["a", "b", "c", "d", "e"] {
|
||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
||||
test::black_box(|| documents);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let db_path = dir.path().join("bench.mdb");
|
||||
let index_name = "default";
|
||||
|
||||
let database = Database::create(&db_path)?;
|
||||
database.create_index(index_name, &schema)?;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Document {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
}
|
||||
|
||||
let tokenizer_builder = DefaultBuilder;
|
||||
let mut builder = database.start_update(index_name)?;
|
||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||
|
||||
for i in 0..30_000 {
|
||||
let document = Document {
|
||||
id: i,
|
||||
title: random_sentences(rng.gen_range(1, 8), &mut rng),
|
||||
description: random_sentences(rng.gen_range(20, 200), &mut rng),
|
||||
};
|
||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
}
|
||||
|
||||
let view = database.commit_update(builder)?;
|
||||
|
||||
bench.iter(|| {
|
||||
for q in &["a", "b", "c", "d", "e"] {
|
||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
||||
test::black_box(|| documents);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -1,29 +1,36 @@
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::io::{Read, Write};
|
||||
use std::{fmt, u32};
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
use std::{fmt, u16};
|
||||
use std::ops::BitOr;
|
||||
use std::sync::Arc;
|
||||
use std::fs::File;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
|
||||
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct SchemaProps {
|
||||
#[serde(default)]
|
||||
stored: bool,
|
||||
|
||||
#[serde(default)]
|
||||
indexed: bool,
|
||||
}
|
||||
|
||||
impl SchemaProps {
|
||||
pub fn is_stored(&self) -> bool {
|
||||
pub fn is_stored(self) -> bool {
|
||||
self.stored
|
||||
}
|
||||
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
pub fn is_indexed(self) -> bool {
|
||||
self.indexed
|
||||
}
|
||||
}
|
||||
@ -39,33 +46,39 @@ impl BitOr for SchemaProps {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SchemaBuilder {
|
||||
attrs: LinkedHashMap<String, SchemaProps>,
|
||||
identifier: String,
|
||||
attributes: LinkedHashMap<String, SchemaProps>,
|
||||
}
|
||||
|
||||
impl SchemaBuilder {
|
||||
pub fn new() -> SchemaBuilder {
|
||||
SchemaBuilder { attrs: LinkedHashMap::new() }
|
||||
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
|
||||
SchemaBuilder {
|
||||
identifier: name.into(),
|
||||
attributes: LinkedHashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
|
||||
let len = self.attrs.len();
|
||||
if self.attrs.insert(name.into(), props).is_some() {
|
||||
let len = self.attributes.len();
|
||||
if self.attributes.insert(name.into(), props).is_some() {
|
||||
panic!("Field already inserted.")
|
||||
}
|
||||
SchemaAttr(len as u32)
|
||||
SchemaAttr(len as u16)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Schema {
|
||||
let mut attrs = HashMap::new();
|
||||
let mut props = Vec::new();
|
||||
|
||||
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u32));
|
||||
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u16));
|
||||
props.push((name, prop));
|
||||
}
|
||||
|
||||
Schema { inner: Arc::new(InnerSchema { attrs, props }) }
|
||||
let identifier = self.identifier;
|
||||
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
|
||||
}
|
||||
}
|
||||
|
||||
@ -76,69 +89,124 @@ pub struct Schema {
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct InnerSchema {
|
||||
identifier: String,
|
||||
attrs: HashMap<String, SchemaAttr>,
|
||||
props: Vec<(String, SchemaProps)>,
|
||||
}
|
||||
|
||||
impl Schema {
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
|
||||
let file = File::open(path)?;
|
||||
Schema::read_from(file)
|
||||
}
|
||||
|
||||
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
|
||||
let attrs = bincode::deserialize_from(reader)?;
|
||||
let builder = SchemaBuilder { attrs };
|
||||
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
|
||||
let mut buffer = Vec::new();
|
||||
reader.read_to_end(&mut buffer)?;
|
||||
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
|
||||
let string = toml::to_string_pretty(&builder)?;
|
||||
writer.write_all(string.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
|
||||
let mut buffer = Vec::new();
|
||||
reader.read_to_end(&mut buffer)?;
|
||||
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
let string = serde_json::to_string_pretty(&builder)?;
|
||||
writer.write_all(string.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
|
||||
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
|
||||
bincode::serialize_into(writer, &builder)
|
||||
}
|
||||
|
||||
    fn attributes_ordered(&self) -> LinkedHashMap<String, SchemaProps> {
        let mut ordered = BTreeMap::new();
-       for (name, field) in &self.inner.attrs {
-           let index = field.as_u32();
-           let (_, props) = self.inner.props[index as usize];
-           ordered.insert(index, (name, props));
+       for (name, attr) in &self.inner.attrs {
+           let (_, props) = self.inner.props[attr.0 as usize];
+           ordered.insert(attr.0, (name, props));
        }

-       let mut attrs = LinkedHashMap::with_capacity(ordered.len());
+       let mut attributes = LinkedHashMap::with_capacity(ordered.len());
        for (_, (name, props)) in ordered {
-           attrs.insert(name, props);
+           attributes.insert(name.clone(), props);
        }

-       bincode::serialize_into(writer, &attrs)
+       attributes
    }

    pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    {
        let id_attribute_name = &self.inner.identifier;
        let serializer = FindDocumentIdSerializer { id_attribute_name };
        document.serialize(serializer)
    }

    pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
-       let index = attr.as_u32();
-       let (_, props) = self.inner.props[index as usize];
+       let (_, props) = self.inner.props[attr.0 as usize];
        props
    }

    pub fn identifier_name(&self) -> &str {
        &self.inner.identifier
    }

    pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
        self.inner.attrs.get(name.as_ref()).cloned()
    }

    pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
-       let index = attr.as_u32();
-       let (name, _) = &self.inner.props[index as usize];
+       let (name, _) = &self.inner.props[attr.0 as usize];
        name
    }
}

#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
-pub struct SchemaAttr(u32);
+pub struct SchemaAttr(pub(crate) u16);

impl SchemaAttr {
-   pub fn new(value: u32) -> SchemaAttr {
+   pub fn new(value: u16) -> SchemaAttr {
        SchemaAttr(value)
    }

-   pub fn max() -> SchemaAttr {
-       SchemaAttr(u32::MAX)
+   pub fn min() -> SchemaAttr {
+       SchemaAttr(0)
    }

-   pub fn as_u32(&self) -> u32 {
-       self.0
+   pub fn next(self) -> Option<SchemaAttr> {
+       self.0.checked_add(1).map(SchemaAttr)
    }

+   pub fn prev(self) -> Option<SchemaAttr> {
+       self.0.checked_sub(1).map(SchemaAttr)
+   }
+
+   pub fn max() -> SchemaAttr {
+       SchemaAttr(u16::MAX)
+   }
}

@@ -151,22 +219,92 @@ impl fmt::Display for SchemaAttr {
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    #[test]
    fn serialize_deserialize() -> bincode::Result<()> {
-       let mut builder = SchemaBuilder::new();
-       builder.new_attribute("alphabet", STORED);
+       let mut builder = SchemaBuilder::with_identifier("id");
+       builder.new_attribute("alpha", STORED);
        builder.new_attribute("beta", STORED | INDEXED);
        builder.new_attribute("gamma", INDEXED);
        let schema = builder.build();

        let mut buffer = Vec::new();

-       schema.write_to(&mut buffer)?;
-       let schema2 = Schema::read_from(buffer.as_slice())?;
+       schema.write_to_bin(&mut buffer)?;
+       let schema2 = Schema::read_from_bin(buffer.as_slice())?;

        assert_eq!(schema, schema2);

        Ok(())
    }

    #[test]
    fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("alpha", STORED);
        builder.new_attribute("beta", STORED | INDEXED);
        builder.new_attribute("gamma", INDEXED);
        let schema = builder.build();

        let mut buffer = Vec::new();
        schema.to_toml(&mut buffer)?;

        let schema2 = Schema::from_toml(buffer.as_slice())?;
        assert_eq!(schema, schema2);

        let data = r#"
            identifier = "id"

            [attributes."alpha"]
            stored = true

            [attributes."beta"]
            stored = true
            indexed = true

            [attributes."gamma"]
            indexed = true
        "#;
        let schema2 = Schema::from_toml(data.as_bytes())?;
        assert_eq!(schema, schema2);

        Ok(())
    }

    #[test]
    fn serialize_deserialize_json() -> Result<(), Box<Error>> {
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("alpha", STORED);
        builder.new_attribute("beta", STORED | INDEXED);
        builder.new_attribute("gamma", INDEXED);
        let schema = builder.build();

        let mut buffer = Vec::new();
        schema.to_json(&mut buffer)?;

        let schema2 = Schema::from_json(buffer.as_slice())?;
        assert_eq!(schema, schema2);

        let data = r#"
            {
                "identifier": "id",
                "attributes": {
                    "alpha": {
                        "stored": true
                    },
                    "beta": {
                        "stored": true,
                        "indexed": true
                    },
                    "gamma": {
                        "indexed": true
                    }
                }
            }"#;
        let schema2 = Schema::from_json(data.as_bytes())?;
        assert_eq!(schema, schema2);

        Ok(())
    }
}

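Taken together, the schema changes above replace the old bincode-only open/read_from/write_to API with human-editable TOML and JSON formats plus an internal binary form. As a quick orientation, here is a minimal sketch of how a schema might be defined in code and round-tripped through the new TOML format; it mirrors the tests in this diff, and the module path and the "schema.toml" file name are illustrative assumptions rather than something this diff pins down.

    use std::fs::File;
    use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED}; // assumed path

    fn main() -> Result<(), Box<std::error::Error>> {
        // Build a schema exactly like the tests in this diff do.
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
        builder.new_attribute("description", INDEXED);
        let schema = builder.build();

        // Round-trip it through the new TOML representation.
        schema.to_toml(File::create("schema.toml")?)?;
        let reloaded = Schema::from_toml(File::open("schema.toml")?)?;
        assert_eq!(schema, reloaded);
        Ok(())
    }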
243
src/database/serde/find_id.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::serde::{SerializerError, calculate_hash};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct FindDocumentIdSerializer<'a> {
|
||||
pub id_attribute_name: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = FindDocumentIdMapSerializer<'a>;
|
||||
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(FindDocumentIdMapSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(FindDocumentIdStructSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdMapSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if self.id_attribute_name == key {
|
||||
// TODO is it possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdStructSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if self.id_attribute_name == key {
|
||||
// TODO can it be possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
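The serializer above exists only to locate a document's identifier field: the map and struct implementations compare each key against id_attribute_name, bincode-serialize the matching value and hash it into a DocumentId, and fail with DocumentIdNotFound otherwise. A rough sketch of how that surfaces through Schema::document_id from the schema diff earlier; the Movie struct is an invented example, and Schema/SerializerError are the crate types shown in this diff.

    use serde_derive::Serialize;

    #[derive(Serialize)]
    struct Movie {
        id: String,    // must match the schema identifier name
        title: String,
    }

    fn find_id(schema: &Schema) -> Result<(), SerializerError> {
        let movie = Movie { id: "m-42".into(), title: "Metropolis".into() };
        // Drives FindDocumentIdSerializer over the fields and hashes the
        // bincode-serialized "id" value into a DocumentId.
        let _document_id = schema.document_id(&movie)?;
        Ok(())
    }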
191
src/database/serde/indexer_serializer.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::update::DocumentUpdate;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::tokenizer::Token;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
|
||||
pub struct IndexerSerializer<'a, 'b, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub document_id: DocumentId,
|
||||
pub attribute: SchemaAttr,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for token in self.tokenizer_builder.build(v) {
|
||||
let Token { word, word_index, char_index } = token;
|
||||
let document_id = self.document_id;
|
||||
|
||||
// FIXME must u32::try_from instead
|
||||
let attribute = self.attribute.0;
|
||||
let word_index = word_index as u32;
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
let length = word.chars().count() as u16;
|
||||
|
||||
if self.stop_words.contains(&word_lower) { continue }
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
let char_index = char_index as u32;
|
||||
let char_length = length;
|
||||
|
||||
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index)?;
|
||||
}
|
||||
|
||||
let char_index = char_index as u32;
|
||||
let char_length = length;
|
||||
|
||||
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||
self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
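Note the dual indexing in serialize_str above: each token is stored under its lowercased form, and additionally under its unidecoded lowercased form when the two differ, so accented and plain-ASCII queries both match. A tiny standalone illustration using the unidecode crate this project already depends on (the word is just an example):

    fn main() {
        let word = "Déjà";
        let word_lower = word.to_lowercase();                             // "déjà"
        let word_unidecoded = unidecode::unidecode(word).to_lowercase();  // "deja"

        // IndexerSerializer inserts a DocIndex under both byte keys
        // whenever the two spellings differ.
        assert_ne!(word_lower, word_unidecoded);
        println!("indexed as {:?} and {:?}", word_lower, word_unidecoded);
    }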
146
src/database/serde/key_to_string.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::SerializerError;
|
||||
|
||||
pub struct KeyToStringSerializer;
|
||||
|
||||
impl ser::Serializer for KeyToStringSerializer {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
64
src/database/serde/mod.rs
Normal file
@@ -0,0 +1,64 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use serde::ser;
|
||||
|
||||
macro_rules! forward_to_unserializable_type {
|
||||
($($ty:ident => $se_method:ident,)*) => {
|
||||
$(
|
||||
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "$ty" })
|
||||
}
|
||||
)*
|
||||
}
|
||||
}
|
||||
|
||||
pub mod find_id;
|
||||
pub mod key_to_string;
|
||||
pub mod serializer;
|
||||
pub mod indexer_serializer;
|
||||
pub mod deserializer;
|
||||
|
||||
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
|
||||
let mut s = DefaultHasher::new();
|
||||
t.hash(&mut s);
|
||||
s.finish()
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
DocumentIdNotFound,
|
||||
UnserializableType { name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
write!(f, "serialized document does not have an id according to the schema")
|
||||
}
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
||||
|
||||
impl From<String> for SerializerError {
|
||||
fn from(value: String) -> SerializerError {
|
||||
SerializerError::Custom(value)
|
||||
}
|
||||
}
|
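One behavioural note on calculate_hash above: it is std's DefaultHasher run over the value, and the find_id serializers feed it the bincode-serialized identifier to derive a DocumentId. DefaultHasher's algorithm is explicitly unspecified by the standard library and may change between releases, so these ids are an internal detail rather than a stable external identifier. A minimal sketch of the same path:

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    fn calculate_hash<T: Hash>(t: &T) -> u64 {
        let mut s = DefaultHasher::new();
        t.hash(&mut s);
        s.finish()
    }

    fn main() {
        // Identifier value -> bincode bytes -> u64 hash, as find_id.rs does.
        let id_bytes = bincode::serialize(&"m-42").unwrap();
        println!("document hash: {}", calculate_hash(&id_bytes));
    }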
287
src/database/serde/serializer.rs
Normal file
@@ -0,0 +1,287 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::update::DocumentUpdate;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Serializer<'a, 'b, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub document_id: DocumentId,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, 'b, B>;
|
||||
type SerializeStruct = StructSerializer<'a, 'b, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
update: self.update,
|
||||
document_id: self.document_id,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, 'b, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
pub current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V,
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
self.update.insert_attribute_value(attr, &value)?;
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a, 'b, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate<'b>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
self.update.insert_attribute_value(attr, &value)?;
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
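The Serializer and its map/struct helpers above gate every field on the schema: fields unknown to the schema are silently skipped, stored fields are bincode-encoded into the document's key-value entries, and indexed fields are re-serialized through IndexerSerializer so their tokens land in the index. In other words, for a hypothetical schema built as

    builder.new_attribute("title", STORED | INDEXED);
    builder.new_attribute("internal_note", STORED);

a document's title is both retrievable and searchable, its internal_note is only retrievable, and any extra field on the document is ignored.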
198
src/database/update.rs
Normal file
@@ -0,0 +1,198 @@
|
||||
use std::collections::{HashSet, BTreeMap};
|
||||
use std::error::Error;
|
||||
|
||||
use rocksdb::rocksdb::{Writable, WriteBatch};
|
||||
use hashbrown::hash_map::HashMap;
|
||||
use serde::Serialize;
|
||||
use fst::map::Map;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::index::{Positive, PositiveBuilder, Negative};
|
||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::serde::serializer::Serializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::database::schema::Schema;
|
||||
use crate::database::index::Index;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use crate::database::DATA_INDEX;
|
||||
|
||||
pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec
|
||||
|
||||
pub struct Update {
|
||||
schema: Schema,
|
||||
raw_builder: RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl Update {
|
||||
pub(crate) fn new(schema: Schema) -> Update {
|
||||
Update { schema, raw_builder: RawUpdateBuilder::new() }
|
||||
}
|
||||
|
||||
pub fn update_document<T, B>(
|
||||
&mut self,
|
||||
document: T,
|
||||
tokenizer_builder: &B,
|
||||
stop_words: &HashSet<String>,
|
||||
) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
B: TokenizerBuilder,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: document_id,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
update: &mut self.raw_builder.document_update(document_id)?,
|
||||
stop_words: stop_words,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
self.raw_builder.document_update(document_id)?.remove()?;
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
|
||||
self.raw_builder.build()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
enum UpdateType {
|
||||
Updated,
|
||||
Deleted,
|
||||
}
|
||||
|
||||
use UpdateType::{Updated, Deleted};
|
||||
|
||||
pub struct RawUpdateBuilder {
|
||||
documents_update: HashMap<DocumentId, UpdateType>,
|
||||
indexed_words: BTreeMap<Token, Vec<DocIndex>>,
|
||||
batch: WriteBatch,
|
||||
}
|
||||
|
||||
impl RawUpdateBuilder {
|
||||
pub fn new() -> RawUpdateBuilder {
|
||||
RawUpdateBuilder {
|
||||
documents_update: HashMap::new(),
|
||||
indexed_words: BTreeMap::new(),
|
||||
batch: WriteBatch::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
|
||||
use serde::ser::Error;
|
||||
|
||||
match self.documents_update.get(&document_id) {
|
||||
Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
|
||||
Some(Updated) => Err(SerializerError::custom(
|
||||
"This document has already been removed and cannot be updated in the same update"
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<WriteBatch, Box<Error>> {
|
||||
let negative = {
|
||||
let mut removed_document_ids = Vec::new();
|
||||
for (id, update_type) in self.documents_update {
|
||||
if update_type == Deleted {
|
||||
removed_document_ids.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
removed_document_ids.sort_unstable();
|
||||
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
|
||||
let doc_ids = DocIds::new(removed_document_ids);
|
||||
|
||||
Negative::new(doc_ids)
|
||||
};
|
||||
|
||||
let positive = {
|
||||
let mut positive_builder = PositiveBuilder::memory();
|
||||
|
||||
for (key, mut indexes) in self.indexed_words {
|
||||
indexes.sort_unstable();
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
positive_builder.insert(key, indexes)?;
|
||||
}
|
||||
|
||||
let (map, indexes) = positive_builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
let index = Index { negative, positive };
|
||||
|
||||
// write the data-index
|
||||
let mut bytes = Vec::new();
|
||||
index.write_to_bytes(&mut bytes);
|
||||
self.batch.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
Ok(self.batch)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentUpdate<'a> {
|
||||
document_id: DocumentId,
|
||||
inner: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a> DocumentUpdate<'a> {
|
||||
pub fn remove(&mut self) -> Result<(), SerializerError> {
|
||||
use serde::ser::Error;
|
||||
|
||||
if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
|
||||
return Err(SerializerError::custom(
|
||||
"This document has already been updated and cannot be removed in the same update"
|
||||
));
|
||||
}
|
||||
|
||||
let start = DocumentKey::new(self.document_id).with_attribute_min();
|
||||
let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1
|
||||
self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
|
||||
use serde::ser::Error;
|
||||
|
||||
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
|
||||
return Err(SerializerError::custom(
|
||||
"This document has already been deleted and cannot be updated in the same update"
|
||||
));
|
||||
}
|
||||
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.inner.batch.put(key.as_ref(), &value)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
|
||||
use serde::ser::Error;
|
||||
|
||||
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
|
||||
return Err(SerializerError::custom(
|
||||
"This document has already been deleted and cannot be updated in the same update"
|
||||
));
|
||||
}
|
||||
|
||||
self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
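Putting update.rs together: an Update wraps a schema plus a RawUpdateBuilder, update_document/remove_document record per-document changes (and guard against mixing an update and a removal of the same document in one batch), and build() folds everything into a single RocksDB WriteBatch carrying the merged data-index. A rough usage sketch from inside the crate; the DefaultBuilder tokenizer name is a placeholder assumption, Movie is any Serialize document (e.g. the example struct earlier), and Update::new/build are pub(crate), so this only compiles from within the database module:

    use std::collections::HashSet;

    fn index_one_document(schema: Schema, movie: &Movie) -> Result<(), Box<std::error::Error>> {
        let tokenizer_builder = DefaultBuilder::new(); // placeholder: any TokenizerBuilder impl
        let stop_words: HashSet<String> = ["the", "a"].iter().map(|s| s.to_string()).collect();

        let mut update = Update::new(schema);
        // Serializes the document: stored attributes go in the WriteBatch,
        // indexed attributes are tokenized into the in-memory index.
        update.update_document(movie, &tokenizer_builder, &stop_words)?;

        // Folds removals and the positive index into one RocksDB WriteBatch.
        let _batch = update.build()?;
        Ok(())
    }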
@@ -1,35 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
mod negative;
|
||||
mod positive;
|
||||
|
||||
pub use self::positive::{PositiveUpdateBuilder, NewState};
|
||||
pub use self::negative::NegativeUpdateBuilder;
|
||||
|
||||
pub struct Update {
|
||||
path: PathBuf,
|
||||
can_be_moved: bool,
|
||||
}
|
||||
|
||||
impl Update {
|
||||
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: false })
|
||||
}
|
||||
|
||||
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: true })
|
||||
}
|
||||
|
||||
pub fn set_move(&mut self, can_be_moved: bool) {
|
||||
self.can_be_moved = can_be_moved
|
||||
}
|
||||
|
||||
pub fn can_be_moved(&self) -> bool {
|
||||
self.can_be_moved
|
||||
}
|
||||
|
||||
pub fn into_path_buf(self) -> PathBuf {
|
||||
self.path
|
||||
}
|
||||
}
|
@@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::NegativeUpdateBuilder;
|
@@ -1,37 +0,0 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct UnorderedNegativeBlobBuilder<W> {
|
||||
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
|
||||
wrt: W,
|
||||
}
|
||||
|
||||
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
UnorderedNegativeBlobBuilder::new(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
|
||||
pub fn new(wrt: W) -> Self {
|
||||
Self {
|
||||
doc_ids: BTreeSet::new(),
|
||||
wrt: wrt,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, doc: DocumentId) -> bool {
|
||||
self.doc_ids.insert(doc)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
for id in self.doc_ids {
|
||||
self.wrt.write_u64::<NativeEndian>(id)?;
|
||||
}
|
||||
Ok(self.wrt)
|
||||
}
|
||||
}
|
@@ -1,60 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
|
||||
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
|
||||
use crate::database::blob::{Blob, NegativeBlob};
|
||||
use crate::database::update::Update;
|
||||
use crate::database::DocumentKey;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct NegativeUpdateBuilder {
|
||||
path: PathBuf,
|
||||
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl NegativeUpdateBuilder {
|
||||
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
|
||||
NegativeUpdateBuilder {
|
||||
path: path.into(),
|
||||
doc_ids: UnorderedNegativeBlobBuilder::memory(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, id: DocumentId) -> bool {
|
||||
self.doc_ids.insert(id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let bytes = self.doc_ids.into_inner()?;
|
||||
let negative_blob = NegativeBlob::from_bytes(bytes)?;
|
||||
let blob = Blob::Negative(negative_blob);
|
||||
|
||||
// write the data-index aka negative blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// FIXME remove this ugly thing !
|
||||
// let Blob::Negative(negative_blob) = blob;
|
||||
let negative_blob = match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => unreachable!(),
|
||||
};
|
||||
|
||||
for &document_id in negative_blob.as_ref().as_slice() {
|
||||
let start = DocumentKey::new(document_id);
|
||||
let end = start.with_attribute_max();
|
||||
file_writer.delete_range(start.as_ref(), end.as_ref())?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
@@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::{PositiveUpdateBuilder, NewState};
|
@@ -1,49 +0,0 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::error::Error;
|
||||
use std::io::Write;
|
||||
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::positive::PositiveBlobBuilder;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct UnorderedPositiveBlobBuilder<W, X> {
|
||||
builder: PositiveBlobBuilder<W, X>,
|
||||
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
|
||||
}
|
||||
|
||||
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
Self {
|
||||
builder: PositiveBlobBuilder::memory(),
|
||||
map: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
|
||||
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
|
||||
Ok(UnorderedPositiveBlobBuilder {
|
||||
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
|
||||
map: BTreeMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
|
||||
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
|
||||
for (key, mut doc_indexes) in self.map {
|
||||
doc_indexes.sort_unstable();
|
||||
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
|
||||
}
|
||||
self.builder.into_inner()
|
||||
}
|
||||
}
|
@@ -1,514 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
use serde::ser::{self, Serialize};
|
||||
|
||||
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::DocumentKeyAttr;
|
||||
use crate::database::update::Update;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::database::blob::Blob;
|
||||
|
||||
pub enum NewState {
|
||||
Updated { value: Vec<u8> },
|
||||
Removed,
|
||||
}
|
||||
|
||||
pub struct PositiveUpdateBuilder<B> {
|
||||
path: PathBuf,
|
||||
schema: Schema,
|
||||
tokenizer_builder: B,
|
||||
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
|
||||
PositiveUpdateBuilder {
|
||||
path: path.into(),
|
||||
schema: schema,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: UnorderedPositiveBlobBuilder::memory(),
|
||||
new_states: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: id,
|
||||
tokenizer_builder: &self.tokenizer_builder,
|
||||
builder: &mut self.builder,
|
||||
new_states: &mut self.new_states
|
||||
};
|
||||
|
||||
Ok(ser::Serialize::serialize(document, serializer)?)
|
||||
}
|
||||
|
||||
// TODO value must be a field that can be indexed
|
||||
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
|
||||
let value = bincode::serialize(&value).unwrap();
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
|
||||
}
|
||||
|
||||
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
SchemaDontMatch { attribute: String },
|
||||
UnserializableType { name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::SchemaDontMatch { attribute } => {
|
||||
write!(f, "serialized document try to specify the \
|
||||
{:?} attribute that is not known by the schema", attribute)
|
||||
},
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
||||
|
||||
struct Serializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
macro_rules! forward_to_unserializable_type {
|
||||
($($ty:ident => $se_method:ident,)*) => {
|
||||
$(
|
||||
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "$ty" })
|
||||
}
|
||||
)*
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
// Ok(MapSerializer {
|
||||
// schema: self.schema,
|
||||
// document_id: self.document_id,
|
||||
// new_states: self.new_states,
|
||||
// })
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
struct StructSerializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
match self.schema.attribute(key) {
|
||||
Some(attr) => {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.new_states.insert(key, NewState::Updated { value });
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct IndexerSerializer<'a, B> {
|
||||
tokenizer_builder: &'a B,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for (index, word) in self.tokenizer_builder.build(v) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute: self.attribute.as_u32() as u8,
|
||||
attribute_index: index as u32,
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert(word_lower, doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
|
||||
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
||||
let blob = Blob::Positive(positive_blob);
|
||||
|
||||
// write the data-index aka positive blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents fields updates
|
||||
for (key, state) in self.new_states {
|
||||
match state {
|
||||
NewState::Updated { value } => {
|
||||
file_writer.put(key.as_ref(), &value)?
|
||||
},
|
||||
NewState::Removed => file_writer.delete(key.as_ref())?,
|
||||
}
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
@ -9,17 +9,17 @@ use serde::de::DeserializeOwned;
|
||||
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::{retrieve_data_schema, retrieve_data_index};
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::deserializer::Deserializer;
|
||||
use crate::database::serde::deserializer::Deserializer;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::rank::QueryBuilder;
|
||||
use crate::database::index::Index;
|
||||
use crate::rank::{QueryBuilder, FilterFunc};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct DatabaseView<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
snapshot: Snapshot<D>,
|
||||
blob: PositiveBlob,
|
||||
index: Index,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
|
||||
{
|
||||
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
|
||||
let schema = retrieve_data_schema(&snapshot)?;
|
||||
let blob = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, blob, schema })
|
||||
let index = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, index, schema })
|
||||
}
|
||||
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
pub fn blob(&self) -> &PositiveBlob {
|
||||
&self.blob
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
pub fn into_snapshot(self) -> Snapshot<D> {
|
||||
@ -71,19 +71,18 @@ where D: Deref<Target=DB>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> {
|
||||
pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
|
||||
QueryBuilder::new(self)
|
||||
}
|
||||
|
||||
// TODO create an enum error type
|
||||
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
|
||||
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
|
||||
where T: DeserializeOwned
|
||||
{
|
||||
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
|
||||
Ok(T::deserialize(&mut deserializer)?)
|
||||
}
|
||||
|
||||
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
|
||||
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
|
||||
where T: DeserializeOwned,
|
||||
I: IntoIterator<Item=DocumentId>,
|
||||
{
|
||||
@ -100,7 +99,7 @@ where D: Deref<Target=DB>
|
||||
{
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let mut options = ReadOptions::new();
|
||||
let lower = DocumentKey::new(0);
|
||||
let lower = DocumentKey::new(DocumentId(0));
|
||||
options.set_iterate_lower_bound(lower.as_ref());
|
||||
|
||||
let mut iter = self.snapshot.iter_opt(options);
|
||||
@ -149,7 +148,7 @@ where D: Deref<Target=DB>,
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.document_ids.next() {
|
||||
Some(id) => Some(self.database_view.retrieve_document(id)),
|
||||
Some(id) => Some(self.database_view.document_by_id(id)),
|
||||
None => None
|
||||
}
|
||||
}
|
||||
@ -168,7 +167,7 @@ where D: Deref<Target=DB>,
|
||||
{
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
match self.document_ids.next_back() {
|
||||
Some(id) => Some(self.database_view.retrieve_document(id)),
|
||||
Some(id) => Some(self.database_view.document_by_id(id)),
|
||||
None => None
|
||||
}
|
||||
}
|
src/lib.rs (80 changed lines)
@@ -1,9 +1,10 @@
#![cfg_attr(feature = "nightly", feature(test))]

pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;
mod common_words;

pub use rocksdb;
@@ -11,30 +12,36 @@ pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

pub type DocumentId = u64;
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);

/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The document identifier where the word was found.
    pub document_id: DocumentId,

    /// The attribute identifier in the document
    /// where the word was found.
    ///
    /// This is an `u8` therefore a document
    /// can not have more than `2^8` attributes.
    pub attribute: u8,
    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u32,

    /// The index where the word was found in the attribute.
    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// Only the first 1000 words are indexed.
    pub attribute_index: u32,
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u32,
    pub char_length: u16,
}

/// This structure represent a matching word with informations
@@ -45,7 +52,7 @@ pub struct DocIndex {
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
@@ -57,23 +64,21 @@ pub struct Match {
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in which the word is located
    /// (i.e. Title is 0, Description is 1).
    ///
    /// This is an `u8` therefore a document
    /// can not have more than `2^8` attributes.
    pub attribute: u8,

    /// Where does this word is located in the attribute string
    /// (i.e. at the start or the end of the attribute).
    ///
    /// The index in the attribute is limited to a maximum of `2^32`
    /// this is because we index only the first 1000 words
    /// in an attribute.
    pub attribute_index: u32,
    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u32,

    /// Whether the word that match is an exact match or a prefix.
    pub is_exact: bool,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u32,
    pub char_length: u16,
}

impl Match {
@@ -82,8 +87,10 @@ impl Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            attribute_index: 0,
            word_index: 0,
            is_exact: false,
            char_index: 0,
            char_length: 0,
        }
    }

@@ -91,9 +98,22 @@ impl Match {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u8::max_value(),
            attribute_index: u32::max_value(),
            attribute: u16::max_value(),
            word_index: u32::max_value(),
            is_exact: true,
            char_index: u32::max_value(),
            char_length: u16::max_value(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 24);
    }
}
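An illustrative sketch (not part of the diff above): how the new DocIndex layout could be filled at index time. The helper name and the field values are made up; only the field names and types come from the struct definition, and constructing DocumentId this way assumes crate-internal visibility.

// Hypothetical helper, shown only to illustrate the new DocIndex fields.
fn example_doc_index() -> DocIndex {
    DocIndex {
        document_id: DocumentId(2), // the third document of the index
        attribute: 1,               // found in the second attribute (now u16, was u8)
        word_index: 4,              // it is the fifth word of that attribute
        char_index: 21,             // the word starts at byte 21 of the original text
        char_length: 7,             // and is 7 bytes long
    }
}
// With #[repr(C)] the layout is 8 + 2 (+2 padding) + 4 + 4 + 2 (+2 padding) bytes,
// which is the 24 bytes asserted by the docindex_mem_size test above.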
@@ -1,19 +1,13 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;

use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::Document;
use crate::rank::RawDocument;

#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

impl<D> Criterion<D> for DocumentId
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
impl Criterion for DocumentId {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        lhs.id.cmp(&rhs.id)
    }
}
@@ -1,33 +1,40 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn contains_exact(matches: &[Match]) -> bool {
    matches.iter().any(|m| m.is_exact)
}
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
    let mut count = 0;
    let mut index = 0;

#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).map(contains_exact).count()
    for group in query_index.linear_group_by(PartialEq::eq) {
        let len = group.len();
        count += is_exact[index..index + len].contains(&true) as usize;
        index += len;
    }

    count
}

#[derive(Debug, Clone, Copy)]
pub struct Exact;

impl<D> Criterion<D> for Exact
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = number_exact_matches(&lhs.matches);
        let rhs = number_exact_matches(&rhs.matches);
impl Criterion for Exact {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let is_exact = lhs.is_exact();
            number_exact_matches(query_index, is_exact)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let is_exact = rhs.is_exact();
            number_exact_matches(query_index, is_exact)
        };

        lhs.cmp(&rhs).reverse()
    }
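A small worked example of the new number_exact_matches (illustrative, not part of the diff): matches of the same query word form one linear_group_by group, and a group counts as exact if any of its matches is exact.

#[test]
fn number_exact_matches_example() {
    // query word 0 has two matches (one of them exact), query word 1 has a
    // single non-exact match, so only one group contributes to the count
    let query_index = &[0, 0, 1];
    let is_exact = &[false, true, false];
    assert_eq!(number_exact_matches(query_index, is_exact), 1);
}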
@ -8,12 +8,7 @@ mod sort_by;
|
||||
mod document_id;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::ops::Deref;
|
||||
|
||||
use rocksdb::DB;
|
||||
|
||||
use crate::database::DatabaseView;
|
||||
use crate::rank::Document;
|
||||
use crate::rank::RawDocument;
|
||||
|
||||
pub use self::{
|
||||
sum_of_typos::SumOfTypos,
|
||||
@ -26,56 +21,47 @@ pub use self::{
|
||||
document_id::DocumentId,
|
||||
};
|
||||
|
||||
pub trait Criterion<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
#[inline]
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
|
||||
pub trait Criterion: Send + Sync {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
|
||||
|
||||
#[inline]
|
||||
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
|
||||
self.evaluate(lhs, rhs, view) == Ordering::Equal
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
self.evaluate(lhs, rhs) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs, view)
|
||||
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
|
||||
(**self).eq(lhs, rhs, view)
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs, view)
|
||||
impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
|
||||
(**self).eq(lhs, rhs, view)
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CriteriaBuilder<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
inner: Vec<Box<dyn Criterion<D>>>
|
||||
#[derive(Default)]
|
||||
pub struct CriteriaBuilder {
|
||||
inner: Vec<Box<dyn Criterion>>
|
||||
}
|
||||
|
||||
impl<D> CriteriaBuilder<D>
|
||||
where D: Deref<Target=DB>
|
||||
impl CriteriaBuilder
|
||||
{
|
||||
pub fn new() -> CriteriaBuilder<D> {
|
||||
pub fn new() -> CriteriaBuilder {
|
||||
CriteriaBuilder { inner: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> {
|
||||
pub fn with_capacity(capacity: usize) -> CriteriaBuilder {
|
||||
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
|
||||
}
|
||||
|
||||
@ -83,33 +69,29 @@ where D: Deref<Target=DB>
|
||||
self.inner.reserve(additional)
|
||||
}
|
||||
|
||||
pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D>
|
||||
where C: 'static + Criterion<D>,
|
||||
pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder
|
||||
where C: 'static + Criterion,
|
||||
{
|
||||
self.push(criterion);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push<C>(&mut self, criterion: C)
|
||||
where C: 'static + Criterion<D>,
|
||||
where C: 'static + Criterion,
|
||||
{
|
||||
self.inner.push(Box::new(criterion));
|
||||
}
|
||||
|
||||
pub fn build(self) -> Criteria<D> {
|
||||
pub fn build(self) -> Criteria {
|
||||
Criteria { inner: self.inner }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Criteria<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
inner: Vec<Box<dyn Criterion<D>>>,
|
||||
pub struct Criteria {
|
||||
inner: Vec<Box<dyn Criterion>>,
|
||||
}
|
||||
|
||||
impl<D> Default for Criteria<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
impl Default for Criteria {
|
||||
fn default() -> Self {
|
||||
CriteriaBuilder::with_capacity(7)
|
||||
.add(SumOfTypos)
|
||||
@ -123,10 +105,8 @@ where D: Deref<Target=DB>
|
||||
}
|
||||
}
|
||||
|
||||
impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
|
||||
impl AsRef<[Box<dyn Criterion>]> for Criteria {
|
||||
fn as_ref(&self) -> &[Box<dyn Criterion>] {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
|
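An illustrative sketch (not part of the diff): assembling a custom criteria list with the new object-safe Criterion trait, assuming the criteria re-exported above are in scope. The function name and the chosen ordering are just an example.

fn custom_criteria() -> Criteria {
    CriteriaBuilder::with_capacity(4)
        .add(SumOfTypos)
        .add(NumberOfWords)
        .add(Exact)
        .add(DocumentId)
        .build()
}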
@@ -1,28 +1,28 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn number_of_query_words(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).count()
fn number_of_query_words(query_index: &[u32]) -> usize {
    query_index.linear_group_by(PartialEq::eq).count()
}

#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;

impl<D> Criterion<D> for NumberOfWords
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = number_of_query_words(&lhs.matches);
        let rhs = number_of_query_words(&rhs.matches);
impl Criterion for NumberOfWords {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            number_of_query_words(query_index)
        };
        let rhs = {
            let query_index = rhs.query_index();
            number_of_query_words(query_index)
        };

        lhs.cmp(&rhs).reverse()
    }
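A worked example for number_of_query_words (illustrative, not part of the diff): query_index holds one entry per match, sorted by query word, so counting the groups gives the number of distinct query words the document matched.

#[test]
fn number_of_query_words_example() {
    // matches for query words 0, 2 and 3 -> three distinct words matched
    assert_eq!(number_of_query_words(&[0, 0, 2, 3, 3, 3]), 3);
}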
@ -7,7 +7,7 @@ use serde::de::DeserializeOwned;
|
||||
|
||||
use crate::rank::criterion::Criterion;
|
||||
use crate::database::DatabaseView;
|
||||
use crate::rank::Document;
|
||||
use crate::rank::RawDocument;
|
||||
|
||||
/// An helper struct that permit to sort documents by
|
||||
/// some of their stored attributes.
|
||||
@ -24,7 +24,7 @@ use crate::rank::Document;
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```no-test
|
||||
/// ```ignore
|
||||
/// use serde_derive::Deserialize;
|
||||
/// use meilidb::rank::criterion::*;
|
||||
///
|
||||
@ -40,34 +40,40 @@ use crate::rank::Document;
|
||||
/// .add(SumOfWordsAttribute)
|
||||
/// .add(SumOfWordsPosition)
|
||||
/// .add(Exact)
|
||||
/// .add(SortBy::<TimeOnly>::new())
|
||||
/// .add(SortBy::<TimeOnly>::new(&view))
|
||||
/// .add(DocumentId);
|
||||
///
|
||||
/// let criterion = builder.build();
|
||||
///
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct SortBy<T> {
|
||||
pub struct SortBy<'a, T, D>
|
||||
where D: Deref<Target=DB> + Send + Sync,
|
||||
T: Send + Sync
|
||||
{
|
||||
view: &'a DatabaseView<D>,
|
||||
_phantom: marker::PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T> SortBy<T> {
|
||||
pub fn new() -> Self {
|
||||
SortBy { _phantom: marker::PhantomData }
|
||||
impl<'a, T, D> SortBy<'a, T, D>
|
||||
where D: Deref<Target=DB> + Send + Sync,
|
||||
T: Send + Sync
|
||||
{
|
||||
pub fn new(view: &'a DatabaseView<D>) -> Self {
|
||||
SortBy { view, _phantom: marker::PhantomData }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, D> Criterion<D> for SortBy<T>
|
||||
where D: Deref<Target=DB>,
|
||||
T: DeserializeOwned + Ord,
|
||||
impl<'a, T, D> Criterion for SortBy<'a, T, D>
|
||||
where D: Deref<Target=DB> + Send + Sync,
|
||||
T: DeserializeOwned + Ord + Send + Sync,
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
|
||||
let lhs = match view.retrieve_document::<T>(lhs.id) {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = match self.view.document_by_id::<T>(lhs.id) {
|
||||
Ok(doc) => Some(doc),
|
||||
Err(e) => { eprintln!("{}", e); None },
|
||||
};
|
||||
|
||||
let rhs = match view.retrieve_document::<T>(rhs.id) {
|
||||
let rhs = match self.view.document_by_id::<T>(rhs.id) {
|
||||
Ok(doc) => Some(doc),
|
||||
Err(e) => { eprintln!("{}", e); None },
|
||||
};
|
||||
|
@ -1,25 +1,20 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::ops::Deref;
|
||||
|
||||
use rocksdb::DB;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use group_by::GroupBy;
|
||||
|
||||
use crate::rank::{match_query_index, Document};
|
||||
use crate::rank::criterion::Criterion;
|
||||
use crate::database::DatabaseView;
|
||||
use crate::Match;
|
||||
use crate::rank::RawDocument;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(matches: &[Match]) -> i8 {
|
||||
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize {
|
||||
let mut sum_typos = 0;
|
||||
let mut number_words = 0;
|
||||
let mut index = 0;
|
||||
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can do this assumption safely
|
||||
for group in GroupBy::new(matches, match_query_index) {
|
||||
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
|
||||
for group in query_index.linear_group_by(PartialEq::eq) {
|
||||
sum_typos += distance[index] as isize;
|
||||
number_words += 1;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_typos - number_words
|
||||
@ -28,18 +23,24 @@ fn sum_matches_typos(matches: &[Match]) -> i8 {
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfTypos;
|
||||
|
||||
impl<D> Criterion<D> for SumOfTypos
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
||||
let lhs = sum_matches_typos(&lhs.matches);
|
||||
let rhs = sum_matches_typos(&rhs.matches);
|
||||
impl Criterion for SumOfTypos {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -50,30 +51,14 @@ mod tests {
|
||||
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
||||
#[test]
|
||||
fn one_typo_reference() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index1 = &[0, 1];
|
||||
let distance1 = &[1, 0];
|
||||
|
||||
let lhs = sum_matches_typos(&doc0.matches);
|
||||
let rhs = sum_matches_typos(&doc1.matches);
|
||||
let lhs = sum_matches_typos(query_index0, distance0);
|
||||
let rhs = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
||||
}
|
||||
|
||||
@ -83,29 +68,14 @@ mod tests {
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn no_typo() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let lhs = sum_matches_typos(&doc0.matches);
|
||||
let rhs = sum_matches_typos(&doc1.matches);
|
||||
let lhs = sum_matches_typos(query_index0, distance0);
|
||||
let rhs = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
|
||||
}
|
||||
|
||||
@ -115,29 +85,14 @@ mod tests {
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn one_typo() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 1];
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let lhs = sum_matches_typos(&doc0.matches);
|
||||
let rhs = sum_matches_typos(&doc1.matches);
|
||||
let lhs = sum_matches_typos(query_index0, distance0);
|
||||
let rhs = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
|
||||
}
|
||||
}
|
||||
|
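A worked example for the new sum_matches_typos (illustrative, not part of the diff): the returned score is the sum of typos minus the number of matched query words, so matching an extra query word with a single typo still ties with matching fewer words perfectly, which is what the one_typo test above checks.

#[test]
fn sum_matches_typos_example() {
    // two query words matched, one of them with a single typo: 1 - 2 = -1
    assert_eq!(sum_matches_typos(&[0, 1], &[1, 0]), -1);
    // a single query word matched with no typo: 0 - 1 = -1, hence the tie
    assert_eq!(sum_matches_typos(&[0], &[0]), -1);
}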
@@ -1,32 +1,39 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
    GroupBy::new(matches, match_query_index).map(|group| unsafe {
        group.get_unchecked(0).attribute
    }).sum()
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
    let mut sum_attributes = 0;
    let mut index = 0;

    for group in query_index.linear_group_by(PartialEq::eq) {
        sum_attributes += attribute[index] as usize;
        index += group.len();
    }

    sum_attributes
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;

impl<D> Criterion<D> for SumOfWordsAttribute
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = sum_matches_attributes(&lhs.matches);
        let rhs = sum_matches_attributes(&rhs.matches);
impl Criterion for SumOfWordsAttribute {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let attribute = lhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let attribute = rhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        lhs.cmp(&rhs)
    }
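A worked example for the new sum_matches_attributes (illustrative, not part of the diff): only the first match of each query-word group contributes its attribute number, so documents whose query words appear in lower-numbered attributes sort first.

#[test]
fn sum_matches_attributes_example() {
    // query word 0 is first found in attribute 0, query word 1 in attribute 2
    assert_eq!(sum_matches_attributes(&[0, 0, 1], &[0, 1, 2]), 2);
}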
@@ -1,32 +1,39 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
    GroupBy::new(matches, match_query_index).map(|group| unsafe {
        group.get_unchecked(0).attribute_index
    }).sum()
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
    let mut sum_word_index = 0;
    let mut index = 0;

    for group in query_index.linear_group_by(PartialEq::eq) {
        sum_word_index += word_index[index] as usize;
        index += group.len();
    }

    sum_word_index
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;

impl<D> Criterion<D> for SumOfWordsPosition
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = sum_matches_attribute_index(&lhs.matches);
        let rhs = sum_matches_attribute_index(&rhs.matches);
impl Criterion for SumOfWordsPosition {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let word_index = lhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let word_index = rhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        lhs.cmp(&rhs)
    }
@ -1,16 +1,17 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use std::ops::Deref;
|
||||
|
||||
use rocksdb::DB;
|
||||
use group_by::GroupBy;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::rank::{match_query_index, Document};
|
||||
use crate::rank::criterion::Criterion;
|
||||
use crate::database::DatabaseView;
|
||||
use crate::Match;
|
||||
use crate::rank::RawDocument;
|
||||
|
||||
const MAX_DISTANCE: u32 = 8;
|
||||
|
||||
#[inline]
|
||||
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
|
||||
(a.clone(), b.clone())
|
||||
}
|
||||
|
||||
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
if lhs < rhs {
|
||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||
@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
|
||||
if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
|
||||
index_proximity(lhs.attribute_index, rhs.attribute_index)
|
||||
fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
|
||||
if lattr != rattr { return MAX_DISTANCE }
|
||||
index_proximity(lwi, rwi)
|
||||
}
|
||||
|
||||
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
|
||||
fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
|
||||
let mut min_prox = u32::max_value();
|
||||
for a in lhs {
|
||||
for b in rhs {
|
||||
for a in lattr.iter().zip(lwi) {
|
||||
for b in rattr.iter().zip(rwi) {
|
||||
let a = clone_tuple(a);
|
||||
let b = clone_tuple(b);
|
||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||
}
|
||||
}
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(matches: &[Match]) -> u32 {
|
||||
fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 {
|
||||
let mut proximity = 0;
|
||||
let mut iter = GroupBy::new(matches, match_query_index);
|
||||
|
||||
// iterate over groups by windows of size 2
|
||||
let mut last = iter.next();
|
||||
let mut index = 0;
|
||||
let mut iter = query_index.linear_group_by(PartialEq::eq);
|
||||
let mut last = iter.next().map(|group| {
|
||||
let len = group.len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
index += len;
|
||||
|
||||
(rattr, rwi)
|
||||
});
|
||||
|
||||
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
||||
let len = rhs.len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
let rhs = (rattr, rwi);
|
||||
|
||||
proximity += min_proximity(lhs, rhs);
|
||||
last = Some(rhs);
|
||||
index += len;
|
||||
}
|
||||
|
||||
proximity
|
||||
@ -51,18 +70,26 @@ fn matches_proximity(matches: &[Match]) -> u32 {
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WordsProximity;
|
||||
|
||||
impl<D> Criterion<D> for WordsProximity
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
|
||||
let lhs = matches_proximity(&lhs.matches);
|
||||
let rhs = matches_proximity(&rhs.matches);
|
||||
impl Criterion for WordsProximity {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let attribute = lhs.attribute();
|
||||
let word_index = lhs.word_index();
|
||||
matches_proximity(query_index, attribute, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let attribute = rhs.attribute();
|
||||
let word_index = rhs.word_index();
|
||||
matches_proximity(query_index, attribute, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -78,18 +105,14 @@ mod tests {
|
||||
// { id: 2, attr: 2, attr_index: 0 }
|
||||
// { id: 3, attr: 3, attr_index: 1 }
|
||||
|
||||
let matches = &[
|
||||
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
|
||||
];
|
||||
let query_index = &[0, 1, 2, 2, 3];
|
||||
let attribute = &[0, 1, 1, 2, 3];
|
||||
let word_index = &[0, 0, 1, 0, 1];
|
||||
|
||||
// soup -> of = 8
|
||||
// + of -> the = 1
|
||||
// + the -> day = 8 (not 1)
|
||||
assert_eq!(matches_proximity(matches), 17);
|
||||
assert_eq!(matches_proximity(query_index, attribute, word_index), 17);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -104,18 +127,13 @@ mod tests {
|
||||
// { id: 3, attr: 0, attr_index: 1 }
|
||||
// { id: 3, attr: 1, attr_index: 3 }
|
||||
|
||||
let matches = &[
|
||||
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
|
||||
];
|
||||
let query_index = &[0, 0, 1, 2, 3, 3];
|
||||
let attribute = &[0, 1, 1, 1, 0, 1];
|
||||
let word_index = &[0, 0, 1, 2, 1, 3];
|
||||
|
||||
// soup -> of = 1
|
||||
// + of -> the = 1
|
||||
// + the -> day = 1
|
||||
assert_eq!(matches_proximity(matches), 3);
|
||||
assert_eq!(matches_proximity(query_index, attribute, word_index), 3);
|
||||
}
|
||||
}
|
||||
|
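A worked example for the reworked matches_proximity (illustrative, not part of the diff): consecutive query words found side by side in the same attribute cost 1, while words found in two different attributes always cost MAX_DISTANCE (8).

#[test]
fn matches_proximity_example() {
    // both words in attribute 0, at word positions 0 and 1: proximity 1
    assert_eq!(matches_proximity(&[0, 1], &[0, 0], &[0, 1]), 1);
    // the second word lives in another attribute: proximity jumps to 8
    assert_eq!(matches_proximity(&[0, 1], &[0, 1], &[0, 1]), 8);
}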
src/rank/mod.rs (182 changed lines)
@ -2,32 +2,182 @@ pub mod criterion;
|
||||
mod query_builder;
|
||||
mod distinct_map;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use slice_group_by::GroupBy;
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
|
||||
use crate::{Match, DocumentId};
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
|
||||
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
||||
|
||||
#[inline]
|
||||
fn match_query_index(a: &Match, b: &Match) -> bool {
|
||||
a.query_index == b.query_index
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Document {
|
||||
pub id: DocumentId,
|
||||
pub matches: Vec<Match>,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
pub fn new(doc: DocumentId, match_: Match) -> Self {
|
||||
unsafe { Self::from_sorted_matches(doc, vec![match_]) }
|
||||
}
|
||||
fn from_raw(raw: &RawDocument) -> Document {
|
||||
let len = raw.matches.range.len();
|
||||
let mut matches = Vec::with_capacity(len);
|
||||
|
||||
pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self {
|
||||
matches.sort_unstable();
|
||||
unsafe { Self::from_sorted_matches(doc, matches) }
|
||||
}
|
||||
let query_index = raw.query_index();
|
||||
let distance = raw.distance();
|
||||
let attribute = raw.attribute();
|
||||
let word_index = raw.word_index();
|
||||
let is_exact = raw.is_exact();
|
||||
let char_index = raw.char_index();
|
||||
let char_length = raw.char_length();
|
||||
|
||||
pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
|
||||
Self { id, matches }
|
||||
for i in 0..len {
|
||||
let match_ = Match {
|
||||
query_index: query_index[i],
|
||||
distance: distance[i],
|
||||
attribute: attribute[i],
|
||||
word_index: word_index[i],
|
||||
is_exact: is_exact[i],
|
||||
char_index: char_index[i],
|
||||
char_length: char_length[i],
|
||||
};
|
||||
matches.push(match_);
|
||||
}
|
||||
|
||||
Document { id: raw.id, matches }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawDocument {
|
||||
pub id: DocumentId,
|
||||
pub matches: SharedMatches,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
|
||||
RawDocument { id, matches: SharedMatches { range, matches } }
|
||||
}
|
||||
|
||||
pub fn query_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn distance(&self) -> &[u8] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn word_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> &[bool] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn char_index(&self) -> &[u32] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
|
||||
pub fn char_length(&self) -> &[u16] {
|
||||
let r = self.matches.range;
|
||||
// it is safe because construction/modifications
|
||||
// can only be done in this module
|
||||
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
|
||||
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
|
||||
let mut matches2 = Matches::with_capacity(matches.len());
|
||||
|
||||
matches.par_sort_unstable();
|
||||
|
||||
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
|
||||
let id = group[0].0;
|
||||
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
|
||||
let end = start + group.len();
|
||||
docs_ranges.push((id, Range { start, end }));
|
||||
|
||||
matches2.extend_from_slice(group);
|
||||
}
|
||||
|
||||
let matches = Arc::new(matches2);
|
||||
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct Range {
|
||||
start: usize,
|
||||
end: usize,
|
||||
}
|
||||
|
||||
impl Range {
|
||||
fn len(self) -> usize {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedMatches {
|
||||
range: Range,
|
||||
matches: Arc<Matches>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Matches {
|
||||
query_index: Vec<u32>,
|
||||
distance: Vec<u8>,
|
||||
attribute: Vec<u16>,
|
||||
word_index: Vec<u32>,
|
||||
is_exact: Vec<bool>,
|
||||
char_index: Vec<u32>,
|
||||
char_length: Vec<u16>,
|
||||
}
|
||||
|
||||
impl Matches {
|
||||
fn with_capacity(cap: usize) -> Matches {
|
||||
Matches {
|
||||
query_index: Vec::with_capacity(cap),
|
||||
distance: Vec::with_capacity(cap),
|
||||
attribute: Vec::with_capacity(cap),
|
||||
word_index: Vec::with_capacity(cap),
|
||||
is_exact: Vec::with_capacity(cap),
|
||||
char_index: Vec::with_capacity(cap),
|
||||
char_length: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
|
||||
for (_, match_) in matches {
|
||||
self.query_index.push(match_.query_index);
|
||||
self.distance.push(match_.distance);
|
||||
self.attribute.push(match_.attribute);
|
||||
self.word_index.push(match_.word_index);
|
||||
self.is_exact.push(match_.is_exact);
|
||||
self.char_index.push(match_.char_index);
|
||||
self.char_length.push(match_.char_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
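An illustrative sketch (not part of the diff): the column slices returned by a RawDocument are parallel views over the shared struct-of-arrays Matches buffer, so they always have the same length and can be re-zipped into Match values, which is exactly what Document::from_raw does above. The helper name is hypothetical.

fn assert_columns_are_parallel(raw: &RawDocument) {
    let len = raw.query_index().len();
    assert_eq!(raw.distance().len(), len);
    assert_eq!(raw.attribute().len(), len);
    assert_eq!(raw.word_index().len(), len);
    assert_eq!(raw.is_exact().len(), len);
    assert_eq!(raw.char_index().len(), len);
    assert_eq!(raw.char_length().len(), len);
}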
@ -4,17 +4,20 @@ use std::error::Error;
|
||||
use std::hash::Hash;
|
||||
use std::rc::Rc;
|
||||
|
||||
use group_by::GroupByMut;
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
use slice_group_by::GroupByMut;
|
||||
use elapsed::measure_time;
|
||||
use hashbrown::HashMap;
|
||||
use fst::Streamer;
|
||||
use rocksdb::DB;
|
||||
use log::info;
|
||||
|
||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||
use crate::rank::criterion::Criteria;
|
||||
use crate::database::DatabaseView;
|
||||
use crate::{Match, DocumentId};
|
||||
use crate::rank::Document;
|
||||
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
|
||||
|
||||
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
@ -34,34 +37,45 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||
automatons
|
||||
}
|
||||
|
||||
pub struct QueryBuilder<'a, D>
|
||||
pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
|
||||
|
||||
pub struct QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
view: &'a DatabaseView<D>,
|
||||
criteria: Criteria<D>,
|
||||
criteria: Criteria,
|
||||
filter: Option<FI>,
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
|
||||
QueryBuilder::with_criteria(view, Criteria::default())
|
||||
}
|
||||
|
||||
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria) -> Result<Self, Box<Error>> {
|
||||
Ok(QueryBuilder { view, criteria, filter: None })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
where D: Deref<Target=DB>
|
||||
impl<'a, D, FI> QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>,
|
||||
{
|
||||
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
|
||||
Ok(QueryBuilder { view, criteria })
|
||||
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
{
|
||||
QueryBuilder {
|
||||
view: self.view,
|
||||
criteria: self.criteria,
|
||||
filter: Some(function)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn criteria(&mut self, criteria: Criteria<D>) -> &mut Self {
|
||||
self.criteria = criteria;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_distinct<F>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, F> {
|
||||
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self,
|
||||
function: function,
|
||||
@ -69,19 +83,19 @@ where D: Deref<Target=DB>
|
||||
}
|
||||
}
|
||||
|
||||
fn query_all(&self, query: &str) -> Vec<Document> {
|
||||
fn query_all(&self, query: &str) -> Vec<RawDocument> {
|
||||
let automatons = split_whitespace_automatons(query);
|
||||
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::map::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = self.view.blob().as_map().search(automaton);
|
||||
let stream = self.view.index().positive.map().search(automaton);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.union()
|
||||
};
|
||||
|
||||
let mut matches = HashMap::new();
|
||||
let mut matches = Vec::new();
|
||||
|
||||
while let Some((input, indexed_values)) = stream.next() {
|
||||
for iv in indexed_values {
|
||||
@ -89,7 +103,7 @@ where D: Deref<Target=DB>
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
|
||||
let doc_indexes = self.view.blob().as_indexes();
|
||||
let doc_indexes = &self.view.index().positive.indexes();
|
||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||
|
||||
for doc_index in doc_indexes {
|
||||
@ -97,31 +111,50 @@ where D: Deref<Target=DB>
|
||||
query_index: iv.index as u32,
|
||||
distance: distance,
|
||||
attribute: doc_index.attribute,
|
||||
attribute_index: doc_index.attribute_index,
|
||||
word_index: doc_index.word_index,
|
||||
is_exact: is_exact,
|
||||
char_index: doc_index.char_index,
|
||||
char_length: doc_index.char_length,
|
||||
};
|
||||
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
|
||||
matches.push((doc_index.document_id, match_));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect()
|
||||
let total_matches = matches.len();
|
||||
let raw_documents = raw_documents_from_matches(matches);
|
||||
|
||||
info!("{} total documents to classify", raw_documents.len());
|
||||
info!("{} total matches to classify", total_matches);
|
||||
|
||||
raw_documents
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
impl<'a, D, FI> QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>,
|
||||
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
{
|
||||
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
let mut documents = self.query_all(query);
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
let view = &self.view;
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
// We delegate the filter work to the distinct query builder,
|
||||
// specifying a distinct rule that has no effect.
|
||||
if self.filter.is_some() {
|
||||
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
|
||||
return builder.query(query, range);
|
||||
}
|
||||
|
||||
'criteria: for criterion in self.criteria.as_ref() {
|
||||
let (elapsed, mut documents) = measure_time(|| self.query_all(query));
|
||||
info!("query_all took {}", elapsed);
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
|
||||
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
|
||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
||||
let mut documents_seen = 0;
|
||||
|
||||
for group in tmp_groups {
|
||||
info!("criterion {}, documents group of size {}", ci, group.len());
|
||||
|
||||
// if this group does not overlap with the requested range,
|
||||
// push it without sorting and splitting it
|
||||
if documents_seen + group.len() < range.start {
|
||||
@ -130,9 +163,12 @@ where D: Deref<Target=DB>,
|
||||
continue;
|
||||
}
|
||||
|
||||
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
|
||||
let (elapsed, _) = measure_time(|| {
|
||||
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
|
||||
});
|
||||
info!("criterion {} sort took {}", ci, elapsed);
|
||||
|
||||
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
|
||||
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
|
||||
@ -143,46 +179,63 @@ where D: Deref<Target=DB>,
|
||||
}
|
||||
}
|
||||
|
||||
// `drain` removes the documents efficiently using `ptr::copy`
|
||||
// TODO it could be more efficient to have a custom iterator
|
||||
let offset = cmp::min(documents.len(), range.start);
|
||||
documents.drain(0..offset);
|
||||
documents.truncate(range.len());
|
||||
documents
|
||||
let iter = documents.into_iter().skip(offset).take(range.len());
|
||||
iter.map(|d| Document::from_raw(&d)).collect()
|
||||
}
|
||||
}

pub struct DistinctQueryBuilder<'a, D, F>
pub struct DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>
{
inner: QueryBuilder<'a, D>,
function: F,
inner: QueryBuilder<'a, D, FI>,
function: FD,
size: usize,
}

impl<'a, D, F, K> DistinctQueryBuilder<'a, D, F>
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}

impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query);
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query));
info!("query_all took {}", elapsed);

let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let view = &self.inner.view;

let mut filter_map = HashMap::new();
// these two variables informs on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0;

'criteria: for criterion in self.inner.criteria.as_ref() {
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;

for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());

// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
@ -191,19 +244,32 @@ where D: Deref<Target=DB>,
continue;
}

group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
let (elapsed, _) = measure_time(|| {
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
});
info!("criterion {} sort took {}", ci, elapsed);

for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));

match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
},
None => true,
};

if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));

match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}

// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
@ -229,16 +295,22 @@ where D: Deref<Target=DB>,
let mut seen = BufferedDistinctMap::new(&mut distinct_map);

for document in documents.into_iter().skip(distinct_raw_offset) {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");

let accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};

if accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};

if distinct_accepted && seen.len() > range.start {
out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break }
}
}
}
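The distinct pass above caches the filter verdict and the distinct key per document id, then registers each key against a DistinctMap that accepts at most `size` documents per key; documents without a key are always kept. Below is a small self-contained sketch of that counting idea, with an illustrative DistinctCounter type rather than the crate's actual DistinctMap/BufferedDistinctMap.

use std::collections::HashMap;
use std::hash::Hash;

// Illustrative stand-in for a distinct map: accept at most `size` documents per key.
struct DistinctCounter<K> {
    counts: HashMap<K, usize>,
    size: usize,
    len: usize, // number of accepted documents so far
}

impl<K: Hash + Eq> DistinctCounter<K> {
    fn new(size: usize) -> Self {
        DistinctCounter { counts: HashMap::new(), size, len: 0 }
    }

    // returns true when the document is kept, false when its key is already "full"
    fn register(&mut self, key: K) -> bool {
        let count = self.counts.entry(key).or_insert(0);
        if *count < self.size {
            *count += 1;
            self.len += 1;
            true
        } else {
            false
        }
    }

    // documents without a distinct key are always kept
    fn register_without_key(&mut self) -> bool {
        self.len += 1;
        true
    }
}

fn main() {
    let mut seen = DistinctCounter::new(1);
    assert!(seen.register("movie-42"));   // first document for this key is kept
    assert!(!seen.register("movie-42"));  // duplicates beyond `size` are rejected
    assert!(seen.register_without_key()); // keyless documents always pass
    assert_eq!(seen.len, 2);
}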
@ -2,7 +2,7 @@ use std::mem;
use self::Separator::*;

pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}

pub struct DefaultBuilder;
@ -13,22 +13,39 @@ impl DefaultBuilder {
}
}

#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}

impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}

pub struct Tokenizer<'a> {
index: usize,
word_index: usize,
char_index: usize,
inner: &'a str,
}

impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}

Tokenizer {
index: 0,
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]),
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
@ -56,43 +73,58 @@ impl Separator {
}
}

fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}

impl<'a> Iterator for Tokenizer<'a> {
type Item = (usize, &'a str);
type Item = Token<'a>;

fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;

for (i, c) in self.inner.char_indices() {
let separator = match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
};

match separator {
Some(dist) => {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (word, tail) = self.inner.split_at(i);
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);

self.inner = tail;
self.index += distance.map(Separator::to_usize).unwrap_or(0);
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};

self.char_index += word.chars().count();
return Some(token)
}
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));

distance.replace(distance.map_or(sep, |s| s.add(sep)));
},
None => { start_word.get_or_insert(i); },
}
}

if let Some(start_word) = start_word {
let word = mem::replace(&mut self.inner, "");
self.index += distance.map(Separator::to_usize).unwrap_or(0);
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);

let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}

None
@ -107,12 +139,12 @@ mod tests {
fn easy() {
let mut tokenizer = Tokenizer::new("salut");

assert_eq!(tokenizer.next(), Some((0, "salut")));
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);

let mut tokenizer = Tokenizer::new("yo ");

assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}

@ -120,18 +152,37 @@ mod tests {
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");

assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((1, "lolo")));
assert_eq!(tokenizer.next(), Some((9, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None);

let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");

assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((8, "lolo")));
assert_eq!(tokenizer.next(), Some((16, "wtf")));
assert_eq!(tokenizer.next(), Some((24, "lol")));
assert_eq!(tokenizer.next(), Some((32, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}

#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");

assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);

let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");

assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
}
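With this change a token no longer yields a bare (index, word) pair: it carries a word_index that advances across separators (by 1 for a soft separator such as a space or quote, by 8 for a hard one such as '!' or '.', as the tests above show) and a char_index counted in characters rather than bytes, which is what keeps the expectations aligned on "aïe" and the emoji inputs. A short usage sketch, assuming the Tokenizer and Token types from the diff above are in scope:

// Usage sketch only; relies on the Tokenizer/Token shown in the diff.
fn main() {
    // Prints each word together with its word_index (separator distance) and
    // char_index (character offset, not byte offset).
    for token in Tokenizer::new("yo ! lolo. aïe") {
        println!("{} word_index={} char_index={}", token.word, token.word_index, token.char_index);
    }
}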

@ -1,51 +0,0 @@
use std::ops::Deref;
use std::sync::Arc;
use std::fmt;

#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
inner: Arc<Vec<T>>,
offset: usize,
len: usize,
}

impl<T> VecReadOnly<T> {
pub fn new(vec: Vec<T>) -> Self {
let len = vec.len();
Self {
inner: Arc::new(vec),
offset: 0,
len: len,
}
}

pub fn len(&self) -> usize {
self.len
}

pub fn range(&self, offset: usize, len: usize) -> Self {
Self {
inner: self.inner.clone(),
offset: self.offset + offset,
len: len,
}
}

pub fn as_slice(&self) -> &[T] {
&self.inner[self.offset..self.offset + self.len]
}
}

impl<T> Deref for VecReadOnly<T> {
type Target = [T];

fn deref(&self) -> &Self::Target {
self.as_slice()
}
}

impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.inner.fmt(f)
}
}