mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-18 20:30:47 +00:00
Compare commits
106 Commits
Author | SHA1 | Date | |
---|---|---|---|
810dfdf656 | |||
f016652fca | |||
6c99ebe3fa | |||
94d357985f | |||
fbc698567a | |||
aa9db14c09 | |||
61e83a1c21 | |||
1316be5b09 | |||
4e8b0383dd | |||
4fa10753c1 | |||
2473e289e8 | |||
e0e5e87ed3 | |||
b13e61f40a | |||
c023cb3065 | |||
0a3d069fbc | |||
fa062ce2cf | |||
cdc6e47bf5 | |||
d5f44838be | |||
5939f6e68a | |||
97edc987f8 | |||
e4e50cecce | |||
77e0c19749 | |||
251bccbbc3 | |||
f7561f8552 | |||
05fd7e87ec | |||
446d6a5455 | |||
78786a0007 | |||
3d820a27ee | |||
ac347d788c | |||
5627f15d41 | |||
e31afc2da2 | |||
77c252e12a | |||
30c9c053c2 | |||
b53ef08d05 | |||
86bfb173ef | |||
8e5f834625 | |||
563b021679 | |||
681f721b1d | |||
8a7c061539 | |||
8c781a4d05 | |||
de59ea495d | |||
966eda8ae5 | |||
32f8908d71 | |||
a2f5e8aa25 | |||
f00b978801 | |||
a78b5d225f | |||
f32a59720d | |||
2cc5fbde1a | |||
34d2850d28 | |||
023f62b0ce | |||
7f35b971f0 | |||
3418adb06a | |||
510426c05c | |||
c74caa0f82 | |||
d899b86603 | |||
0d07af3caf | |||
c594597a01 | |||
ef7ba96d4a | |||
d21406a939 | |||
039a9a4cc7 | |||
40ab9e7a55 | |||
d21abb50fa | |||
3dd5e2445a | |||
7f5e6c5b6e | |||
e6d3840f12 | |||
c05fab783a | |||
95dc6fe904 | |||
b2e9ae4136 | |||
b070778d44 | |||
6731025003 | |||
04544c1531 | |||
9dd68b4eaa | |||
1d67012aa5 | |||
e723e01ec8 | |||
7845292ea8 | |||
521df85c0d | |||
dfa19582a2 | |||
87ec95f7a0 | |||
76ef2cceeb | |||
20b5a6a06e | |||
a842e647f7 | |||
21bb38c3b0 | |||
64d53ee1bd | |||
c022fa3fca | |||
0080bf486f | |||
6bd779f9ae | |||
a18401f47e | |||
7132c3be89 | |||
aa3d059363 | |||
e2a9dbc404 | |||
a0a11faee5 | |||
36ef9581aa | |||
f4b04dfb72 | |||
cf5d56e63a | |||
8412c14b5b | |||
70772eca5c | |||
b27f632e14 | |||
e3bfb866e5 | |||
fa238f21ef | |||
444a4c1af7 | |||
2e5c5fad33 | |||
b32c96cdc9 | |||
62521262e8 | |||
4ebae7784c | |||
a756ca5e3f | |||
aa104fa253 |
36
Cargo.toml
36
Cargo.toml
@ -1,39 +1,61 @@
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "meilidb"
|
||||
version = "0.1.0"
|
||||
version = "0.2.1"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
|
||||
[dependencies]
|
||||
bincode = "1.0"
|
||||
byteorder = "1.2"
|
||||
crossbeam = "0.6"
|
||||
elapsed = "0.1"
|
||||
fst = "0.3"
|
||||
hashbrown = "0.1"
|
||||
hashbrown = { version = "0.1", features = ["serde"] }
|
||||
lazy_static = "1.1"
|
||||
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
|
||||
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
|
||||
log = "0.4"
|
||||
sdset = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = { version = "1.0", features = ["preserve_order"] }
|
||||
unidecode = "0.3"
|
||||
|
||||
[dependencies.toml]
|
||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||
features = ["preserve_order"]
|
||||
rev = "0372ba6"
|
||||
|
||||
[dependencies.rocksdb]
|
||||
git = "https://github.com/pingcap/rust-rocksdb.git"
|
||||
rev = "c2eb140"
|
||||
rev = "306e201"
|
||||
|
||||
[dependencies.group-by]
|
||||
git = "https://github.com/Kerollmops/group-by.git"
|
||||
rev = "cab857b"
|
||||
rev = "5a113fe"
|
||||
|
||||
[features]
|
||||
default = ["simd"]
|
||||
i128 = ["bincode/i128", "byteorder/i128"]
|
||||
simd = ["rocksdb/sse"]
|
||||
portable = ["rocksdb/portable"]
|
||||
nightly = []
|
||||
simd = ["rocksdb/sse"]
|
||||
nightly = ["hashbrown/nightly", "group-by/nightly"]
|
||||
|
||||
[dev-dependencies]
|
||||
csv = "1.0"
|
||||
elapsed = "0.1"
|
||||
env_logger = "0.6"
|
||||
jemallocator = "0.1"
|
||||
quickcheck = "0.8"
|
||||
rand = "0.6"
|
||||
rand_xorshift = "0.1"
|
||||
structopt = "0.2"
|
||||
tempfile = "3.0"
|
||||
termcolor = "1.0"
|
||||
warp = "0.1"
|
||||
|
||||
[dev-dependencies.chashmap]
|
||||
git = "https://gitlab.redox-os.org/redox-os/tfs.git"
|
||||
rev = "b3e7cae1"
|
||||
|
||||
[profile.release]
|
||||
debug = true
|
||||
|
49
README.md
49
README.md
@ -1,47 +1,60 @@
|
||||
# MeiliDB
|
||||
|
||||
[](https://travis-ci.org/Kerollmops/MeiliDB)
|
||||
[](https://deps.rs/repo/github/Kerollmops/MeiliDB)
|
||||
[](https://github.com/Kerollmops/MeiliDB)
|
||||
[](
|
||||
https://www.rust-lang.org)
|
||||
|
||||
A _full-text search database_ using a key-value store internally.
|
||||
|
||||
It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
|
||||
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
|
||||
|
||||
You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
|
||||
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
|
||||
|
||||
We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
|
||||
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
|
||||
|
||||
At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
|
||||
The project is only a library for now, which means that no binary is provided yet. To get started, you can check the examples, which are made to work with the data located in the `misc/` folder.
|
||||
|
||||
In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
|
||||
MeiliDB will be a binary in the near future, so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
|
||||
|
||||
|
||||
|
||||
## Performances
|
||||
|
||||
_These measurements were made with a version dated October 2018; we must update them._
|
||||
With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
|
||||
So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
|
||||
|
||||
We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
|
||||
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
|
||||
|
||||
- near 190 users with an average response time of 90ms
|
||||
- 150 users with an average response time of 70ms
|
||||
- 100 users with an average response time of 45ms
|
||||
|
||||
Network time is measured; the servers are located in Amsterdam and the tests are made between two different datacenters.
|
||||
```
|
||||
Running 10s test @ http://localhost:2230
|
||||
2 threads and 12 connections
|
||||
Thread Stats Avg Stdev Max +/- Stdev
|
||||
Latency 18.86ms 49.39ms 614.89ms 95.23%
|
||||
Req/Sec 620.41 59.53 790.00 65.00%
|
||||
12359 requests in 10.00s, 3.26MB read
|
||||
Requests/sec: 1235.54
|
||||
Transfer/sec: 334.22KB
|
||||
```
|
||||
|
||||
### Notes
|
||||
|
||||
The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
|
||||
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
|
||||
|
||||
## Usage and examples
|
||||
|
||||
MeiliDB work with an index like most of the search engines.
|
||||
MeiliDB runs with an index like most search engines.
|
||||
So to test the library you can create one by indexing a simple csv file.
|
||||
|
||||
```bash
|
||||
cargo run --release --example create-database -- test.mdb misc/kaggle.csv
|
||||
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
|
||||
```
|
||||
|
||||
Once the command finished indexing the database should have been saved under the `test.mdb` folder.
|
||||
|
||||
Now you can easily run the `query-database` example to check what is stored in it.
|
||||
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
|
||||
|
||||
```bash
|
||||
cargo run --release --example query-database -- test.mdb
|
||||
cargo run --release --example query-database -- test.mdb -n 10 id title
|
||||
```
|
||||
|
||||
|
@ -1,91 +1,132 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::error::Error;
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
|
||||
use meilidb::database::update::PositiveUpdateBuilder;
|
||||
use meilidb::database::{Database, Schema, UpdateBuilder};
|
||||
use meilidb::tokenizer::DefaultBuilder;
|
||||
use meilidb::database::Database;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Opt {
|
||||
/// The destination where the database must be created
|
||||
/// The destination where the database must be created.
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
/// The csv file to index.
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub csv_data_path: PathBuf,
|
||||
|
||||
/// The path to the schema.
|
||||
#[structopt(long = "schema", parse(from_os_str))]
|
||||
pub schema_path: PathBuf,
|
||||
|
||||
/// The path to the list of stop words (one by line).
|
||||
#[structopt(long = "stop-words", parse(from_os_str))]
|
||||
pub stop_words_path: Option<PathBuf>,
|
||||
|
||||
#[structopt(long = "update-group-size")]
|
||||
pub update_group_size: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct Document<'a> {
|
||||
id: &'a str,
|
||||
title: &'a str,
|
||||
description: &'a str,
|
||||
image: &'a str,
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct Document<'a> (
|
||||
#[serde(borrow)]
|
||||
HashMap<Cow<'a, str>, Cow<'a, str>>
|
||||
);
|
||||
|
||||
/// Feeds `t` into a freshly created `DefaultHasher` and returns the resulting
/// 64-bit digest.
fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    Hash::hash(t, &mut hasher);
    hasher.finish()
}
|
||||
|
||||
fn create_schema() -> Schema {
|
||||
let mut schema = SchemaBuilder::new();
|
||||
schema.new_attribute("id", STORED);
|
||||
schema.new_attribute("title", STORED | INDEXED);
|
||||
schema.new_attribute("description", STORED | INDEXED);
|
||||
schema.new_attribute("image", STORED);
|
||||
schema.build()
|
||||
}
|
||||
|
||||
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
|
||||
let database = Database::create(database_path, schema.clone())?;
|
||||
|
||||
println!("start indexing...");
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let update_path = tempfile::NamedTempFile::new()?;
|
||||
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
|
||||
fn index(
|
||||
schema: Schema,
|
||||
database_path: &Path,
|
||||
csv_data_path: &Path,
|
||||
update_group_size: Option<usize>,
|
||||
stop_words: &HashSet<String>,
|
||||
) -> Result<Database, Box<Error>>
|
||||
{
|
||||
let database = Database::create(database_path, &schema)?;
|
||||
|
||||
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
||||
let mut raw_record = csv::StringRecord::new();
|
||||
let headers = rdr.headers()?.clone();
|
||||
|
||||
while rdr.read_record(&mut raw_record)? {
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
continue;
|
||||
let mut i = 0;
|
||||
let mut end_of_file = false;
|
||||
|
||||
while !end_of_file {
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let update_path = tempfile::NamedTempFile::new()?;
|
||||
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
|
||||
|
||||
loop {
|
||||
end_of_file = !rdr.read_record(&mut raw_record)?;
|
||||
if end_of_file { break }
|
||||
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
update.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||
|
||||
print!("\rindexing document {}", i);
|
||||
i += 1;
|
||||
|
||||
if let Some(group_size) = update_group_size {
|
||||
if i % group_size == 0 { break }
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let document_id = calculate_hash(&document.id);
|
||||
update.update(document_id, &document).unwrap();
|
||||
println!();
|
||||
|
||||
println!("building update...");
|
||||
let update = update.build()?;
|
||||
println!("ingesting update...");
|
||||
database.ingest_update_file(update)?;
|
||||
}
|
||||
|
||||
let mut update = update.build()?;
|
||||
|
||||
update.set_move(true);
|
||||
database.ingest_update_file(update)?;
|
||||
|
||||
Ok(database)
|
||||
}
|
||||
|
||||
/// Reads the stop words stored in the file at `path`, one word per line.
///
/// Each line is trimmed of surrounding whitespace. Lines that are empty once
/// trimmed are skipped, so a blank or trailing line never registers the empty
/// string as a stop word.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or if a line
/// cannot be read.
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim();
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }

    Ok(words)
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let _ = env_logger::init();
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let schema = create_schema();
|
||||
let schema = {
|
||||
let file = File::open(&opt.schema_path)?;
|
||||
Schema::from_toml(file)?
|
||||
};
|
||||
|
||||
let stop_words = match opt.stop_words_path {
|
||||
Some(ref path) => retrieve_stop_words(path)?,
|
||||
None => HashSet::new(),
|
||||
};
|
||||
|
||||
let (elapsed, result) = elapsed::measure_time(|| {
|
||||
index(schema, &opt.database_path, &opt.csv_data_path)
|
||||
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
@ -93,6 +134,5 @@ fn main() -> Result<(), Box<Error>> {
|
||||
}
|
||||
|
||||
println!("database created in {} at: {:?}", elapsed, opt.database_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
435
examples/http-server.rs
Normal file
435
examples/http-server.rs
Normal file
@ -0,0 +1,435 @@
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use log::{error, info};
|
||||
use std::error::Error;
|
||||
use std::ffi::OsStr;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{PathBuf, Path};
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use chashmap::CHashMap;
|
||||
use chashmap::ReadGuard;
|
||||
use elapsed::measure_time;
|
||||
use meilidb::database::Database;
|
||||
use meilidb::database::UpdateBuilder;
|
||||
use meilidb::database::schema::Schema;
|
||||
use meilidb::database::schema::SchemaBuilder;
|
||||
use meilidb::tokenizer::DefaultBuilder;
|
||||
use serde_derive::Deserialize;
|
||||
use serde_derive::Serialize;
|
||||
use structopt::StructOpt;
|
||||
use warp::{Rejection, Filter};
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Opt {
|
||||
/// The destination where the database must be created.
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
/// The address and port to bind the server to.
|
||||
#[structopt(short = "l", default_value = "127.0.0.1:8080")]
|
||||
pub listen_addr: SocketAddr,
|
||||
|
||||
/// The path to the list of stop words (one by line).
|
||||
#[structopt(long = "stop-words", parse(from_os_str))]
|
||||
pub stop_words: PathBuf,
|
||||
}
|
||||
|
||||
//
|
||||
// ERRORS FOR THE MULTIDATABASE
|
||||
//
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DatabaseError {
|
||||
AlreadyExist,
|
||||
NotExist,
|
||||
NotFound(String),
|
||||
Unknown(Box<Error>),
|
||||
}
|
||||
|
||||
impl fmt::Display for DatabaseError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
DatabaseError::AlreadyExist => write!(f, "File already exist"),
|
||||
DatabaseError::NotExist => write!(f, "File not exist"),
|
||||
DatabaseError::NotFound(ref name) => write!(f, "Database {} not found", name),
|
||||
DatabaseError::Unknown(e) => write!(f, "{}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No method is overridden: the `Debug` and `Display` impls carry the message.
impl Error for DatabaseError {}
|
||||
|
||||
impl From<Box<Error>> for DatabaseError {
|
||||
fn from(e: Box<Error>) -> DatabaseError {
|
||||
DatabaseError::Unknown(e)
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// MULTIDATABASE DEFINITION
|
||||
//
|
||||
|
||||
pub struct MultiDatabase {
|
||||
databases: CHashMap<String, Database>,
|
||||
db_path: PathBuf,
|
||||
stop_words: HashSet<String>,
|
||||
}
|
||||
|
||||
impl MultiDatabase {
|
||||
|
||||
pub fn new(path: PathBuf, stop_words: HashSet<String>) -> MultiDatabase {
|
||||
MultiDatabase {
|
||||
databases: CHashMap::new(),
|
||||
db_path: path,
|
||||
stop_words: stop_words
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
|
||||
let rdb_name = format!("{}.mdb", name);
|
||||
let database_path = self.db_path.join(rdb_name);
|
||||
|
||||
if database_path.exists() {
|
||||
return Err(DatabaseError::AlreadyExist.into());
|
||||
}
|
||||
|
||||
let index = Database::create(database_path, &schema)?;
|
||||
|
||||
self.databases.insert_new(name, index);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn load(&self, name: String) -> Result<(), DatabaseError> {
|
||||
let rdb_name = format!("{}.mdb", name);
|
||||
let index_path = self.db_path.join(rdb_name);
|
||||
|
||||
if !index_path.exists() {
|
||||
return Err(DatabaseError::NotExist.into());
|
||||
}
|
||||
|
||||
let index = Database::open(index_path)?;
|
||||
|
||||
self.databases.insert_new(name, index);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn load_existing(&self) {
|
||||
let paths = match fs::read_dir(self.db_path.clone()){
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
error!("{}", e);
|
||||
return
|
||||
}
|
||||
};
|
||||
|
||||
for path in paths {
|
||||
let path = match path {
|
||||
Ok(p) => p.path(),
|
||||
Err(_) => continue
|
||||
};
|
||||
|
||||
let path_str = match path.to_str() {
|
||||
Some(p) => p,
|
||||
None => continue
|
||||
};
|
||||
|
||||
let extension = match get_extension_from_path(path_str) {
|
||||
Some(e) => e,
|
||||
None => continue
|
||||
};
|
||||
|
||||
if extension != "mdb" {
|
||||
continue
|
||||
}
|
||||
|
||||
let name = match get_file_name_from_path(path_str) {
|
||||
Some(f) => f,
|
||||
None => continue
|
||||
};
|
||||
|
||||
let db = match Database::open(path.clone()) {
|
||||
Ok(db) => db,
|
||||
Err(_) => continue
|
||||
};
|
||||
|
||||
self.databases.insert_new(name.to_string(), db);
|
||||
info!("Load database {}", name);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_or_load(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
|
||||
match self.create(name.clone(), schema) {
|
||||
Err(DatabaseError::AlreadyExist) => self.load(name),
|
||||
x => x,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, name: String) -> Result<ReadGuard<String, Database>, Box<Error>> {
|
||||
Ok(self.databases.get(&name).ok_or(DatabaseError::NotFound(name))?)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the extension of `path` (without the dot), or `None` when the path
/// has no extension.
fn get_extension_from_path(path: &str) -> Option<&str> {
    let extension = Path::new(path).extension()?;
    OsStr::to_str(extension)
}
|
||||
|
||||
/// Returns the file name of `path` stripped of its extension, or `None` when
/// the path has no file name.
fn get_file_name_from_path(path: &str) -> Option<&str> {
    let stem = Path::new(path).file_stem()?;
    OsStr::to_str(stem)
}
|
||||
|
||||
/// Reads the stop words stored in the file at `path`, one word per line.
///
/// Each line is trimmed of surrounding whitespace. Lines that are empty once
/// trimmed are skipped, so a blank or trailing line never registers the empty
/// string as a stop word.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or if a line
/// cannot be read.
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim();
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }

    Ok(words)
}
|
||||
|
||||
//
|
||||
// PARAMS & BODY FOR HTTP HANDLERS
|
||||
//
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CreateBody {
|
||||
name: String,
|
||||
schema: SchemaBuilder,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct IngestBody {
|
||||
insert: Option<Vec<HashMap<String, String>>>,
|
||||
delete: Option<Vec<HashMap<String, String>>>
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IngestResponse {
|
||||
inserted: usize,
|
||||
deleted: usize
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct SearchQuery {
|
||||
q: String,
|
||||
limit: Option<usize>,
|
||||
}
|
||||
|
||||
//
|
||||
// HTTP ROUTES
|
||||
//
|
||||
|
||||
// Create a new index.
|
||||
// The index name should be unused and the schema valid.
|
||||
//
|
||||
// POST /create
|
||||
// Body:
|
||||
// - name: String
|
||||
// - schema: JSON
|
||||
// - stopwords: Vec<String>
|
||||
fn create(body: CreateBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
|
||||
let schema = body.schema.build();
|
||||
|
||||
match db.create(body.name.clone(), schema) {
|
||||
Ok(_) => Ok(format!("{} created ", body.name)),
|
||||
Err(e) => {
|
||||
error!("{:?}", e);
|
||||
return Err(warp::reject::not_found())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ingest new document.
|
||||
// It's possible to have positive or/and negative updates.
|
||||
//
|
||||
// PUT /:name/ingest
|
||||
// Body:
|
||||
// - insert: Option<Vec<JSON>>
|
||||
// - delete: Option<Vec<String>>
|
||||
fn ingest(index_name: String, body: IngestBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
|
||||
|
||||
let schema = {
|
||||
let index = match db.get(index_name.clone()){
|
||||
Ok(i) => i,
|
||||
Err(_) => return Err(warp::reject::not_found()),
|
||||
};
|
||||
let view = index.view();
|
||||
|
||||
view.schema().clone()
|
||||
};
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let now = match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => n.as_secs(),
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
};
|
||||
|
||||
let sst_name = format!("update-{}-{}.sst", index_name, now);
|
||||
let sst_path = db.db_path.join(sst_name);
|
||||
|
||||
let mut response = IngestResponse{inserted: 0, deleted: 0};
|
||||
let mut update = UpdateBuilder::new(sst_path, schema);
|
||||
|
||||
if let Some(documents) = body.delete {
|
||||
for doc in documents {
|
||||
if let Err(e) = update.remove_document(doc) {
|
||||
error!("Impossible to remove document; {:?}", e);
|
||||
} else {
|
||||
response.deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let stop_words = &db.stop_words;
|
||||
if let Some(documents) = body.insert {
|
||||
for doc in documents {
|
||||
if let Err(e) = update.update_document(doc, &tokenizer_builder, &stop_words) {
|
||||
error!("Impossible to update document; {:?}", e);
|
||||
} else {
|
||||
response.inserted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
let update = match update.build() {
|
||||
Ok(u) => u,
|
||||
Err(e) => {
|
||||
error!("Impossible to create an update file; {:?}", e);
|
||||
return Err(warp::reject::not_found())
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
let index = match db.get(index_name.clone()){
|
||||
Ok(i) => i,
|
||||
Err(_) => return Err(warp::reject::not_found()),
|
||||
};
|
||||
|
||||
if let Err(e) = index.ingest_update_file(update) {
|
||||
error!("Impossible to ingest sst file; {:?}", e);
|
||||
return Err(warp::reject::not_found())
|
||||
};
|
||||
}
|
||||
|
||||
if let Ok(response) = serde_json::to_string(&response) {
|
||||
return Ok(response);
|
||||
};
|
||||
|
||||
return Err(warp::reject::not_found())
|
||||
}
|
||||
|
||||
// Search in a specific index
|
||||
// The default limit is 20
|
||||
//
|
||||
// GET /:name/search
|
||||
// Params:
|
||||
// - query: String
|
||||
// - limit: Option<usize>
|
||||
fn search(index_name: String, query: SearchQuery, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
|
||||
|
||||
let view = {
|
||||
let index = match db.get(index_name.clone()){
|
||||
Ok(i) => i,
|
||||
Err(_) => return Err(warp::reject::not_found()),
|
||||
};
|
||||
index.view()
|
||||
};
|
||||
|
||||
let limit = query.limit.unwrap_or(20);
|
||||
|
||||
let query_builder = match view.query_builder() {
|
||||
Ok(q) => q,
|
||||
Err(_err) => return Err(warp::reject::not_found()),
|
||||
};
|
||||
|
||||
let (time, responses) = measure_time(|| {
|
||||
let docs = query_builder.query(&query.q, 0..limit);
|
||||
let mut results: Vec<HashMap<String, String>> = Vec::with_capacity(limit);
|
||||
for doc in docs {
|
||||
match view.document_by_id(doc.id) {
|
||||
Ok(val) => results.push(val),
|
||||
Err(e) => println!("{:?}", e),
|
||||
}
|
||||
}
|
||||
results
|
||||
});
|
||||
|
||||
let response = match serde_json::to_string(&responses) {
|
||||
Ok(val) => val,
|
||||
Err(err) => format!("{:?}", err),
|
||||
};
|
||||
|
||||
info!("index: {} - search: {:?} - limit: {} - time: {}", index_name, query.q, limit, time);
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
fn start_server(listen_addr: SocketAddr, db: Arc<MultiDatabase>) {
|
||||
let index_path = warp::path("index").and(warp::path::param::<String>());
|
||||
let db = warp::any().map(move || db.clone());
|
||||
|
||||
let create_path = warp::path("create").and(warp::path::end());
|
||||
let ingest_path = index_path.and(warp::path("ingest")).and(warp::path::end());
|
||||
let search_path = index_path.and(warp::path("search")).and(warp::path::end());
|
||||
|
||||
let create = warp::post2()
|
||||
.and(create_path)
|
||||
.and(warp::body::json())
|
||||
.and(db.clone())
|
||||
.and_then(create);
|
||||
|
||||
let ingest = warp::put2()
|
||||
.and(ingest_path)
|
||||
.and(warp::body::json())
|
||||
.and(db.clone())
|
||||
.and_then(ingest);
|
||||
|
||||
let search = warp::get2()
|
||||
.and(search_path)
|
||||
.and(warp::query())
|
||||
.and(db.clone())
|
||||
.and_then(search);
|
||||
|
||||
let api = create
|
||||
.or(ingest)
|
||||
.or(search);
|
||||
|
||||
let logs = warp::log("server");
|
||||
let headers = warp::reply::with::header("Content-Type", "application/json");
|
||||
|
||||
let routes = api.with(logs).with(headers);
|
||||
|
||||
info!("Server is started on {}", listen_addr);
|
||||
warp::serve(routes).run(listen_addr);
|
||||
}
|
||||
|
||||
fn main() {
|
||||
env_logger::init();
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let stop_words = match retrieve_stop_words(&opt.stop_words) {
|
||||
Ok(s) => s,
|
||||
Err(_) => HashSet::new(),
|
||||
};
|
||||
|
||||
let db = Arc::new(MultiDatabase::new(opt.database_path.clone(), stop_words));
|
||||
|
||||
db.load_existing();
|
||||
|
||||
start_server(opt.listen_addr, db);
|
||||
}
|
||||
|
||||
|
@ -1,11 +1,19 @@
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::iter::FromIterator;
|
||||
use std::io::{self, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use hashbrown::{HashMap, HashSet};
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use meilidb::database::schema::SchemaAttr;
|
||||
use meilidb::database::Database;
|
||||
use meilidb::Match;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Opt {
|
||||
@ -13,20 +21,87 @@ pub struct Opt {
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub database_path: PathBuf,
|
||||
|
||||
/// Fields that must be displayed.
|
||||
pub displayed_fields: Vec<String>,
|
||||
|
||||
/// The number of returned results
|
||||
#[structopt(short = "n", long = "number-results", default_value = "10")]
|
||||
pub number_results: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct Document {
|
||||
id: String,
|
||||
title: String,
|
||||
description: String,
|
||||
image: String,
|
||||
type Document = HashMap<String, String>;
|
||||
|
||||
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
||||
let mut stdout = StandardStream::stdout(ColorChoice::Always);
|
||||
let mut highlighted = false;
|
||||
|
||||
for range in ranges.windows(2) {
|
||||
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
|
||||
if highlighted {
|
||||
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
|
||||
}
|
||||
write!(&mut stdout, "{}", &text[start..end])?;
|
||||
stdout.reset()?;
|
||||
highlighted = !highlighted;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Converts an area expressed in characters into one expressed in bytes.
///
/// `index` is the position of the first character of the area in `text` and
/// `length` its number of characters. Returns `(byte_index, byte_length)`, the
/// byte offset of the area and its size in bytes.
///
/// The area is clamped to the text:
/// - an empty area (`length == 0`) gives a zero byte length at the offset of
///   `index`;
/// - an area running past the last character stops at the end of the text;
/// - an `index` past the last character gives `(text.len(), 0)`.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    // Byte offset at which each character starts.
    let mut offsets = text.char_indices().map(|(offset, _)| offset);

    let start = offsets.nth(index).unwrap_or(text.len());

    let end = if length == 0 {
        start
    } else {
        // `offsets` now sits right after the first character of the area, so
        // the character following the area is `length - 1` steps away.
        offsets.nth(length - 1).unwrap_or(text.len())
    };

    (start, end - start)
}
|
||||
|
||||
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
|
||||
let mut byte_indexes = BTreeMap::new();
|
||||
|
||||
for match_ in matches {
|
||||
let match_attribute = match_.attribute.attribute();
|
||||
if SchemaAttr::new(match_attribute) == attribute {
|
||||
let word_area = match_.word_area;
|
||||
|
||||
let char_index = word_area.char_index() as usize;
|
||||
let char_length = word_area.length() as usize;
|
||||
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||
|
||||
match byte_indexes.entry(byte_index) {
|
||||
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
||||
Entry::Occupied(mut entry) => {
|
||||
if *entry.get() < byte_length {
|
||||
entry.insert(byte_length);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut title_areas = Vec::new();
|
||||
title_areas.push(0);
|
||||
for (byte_index, length) in byte_indexes {
|
||||
title_areas.push(byte_index);
|
||||
title_areas.push(byte_index + length);
|
||||
}
|
||||
title_areas.push(text.len());
|
||||
title_areas.sort_unstable();
|
||||
title_areas
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let _ = env_logger::init();
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
|
||||
@ -41,26 +116,53 @@ fn main() -> Result<(), Box<Error>> {
|
||||
io::stdout().flush()?;
|
||||
|
||||
if input.read_line(&mut buffer)? == 0 { break }
|
||||
let query = buffer.trim_end_matches('\n');
|
||||
|
||||
let view = database.view();
|
||||
let schema = view.schema();
|
||||
|
||||
let (elapsed, documents) = elapsed::measure_time(|| {
|
||||
let builder = view.query_builder().unwrap();
|
||||
builder.query(&buffer, 0..opt.number_results)
|
||||
builder.query(query, 0..opt.number_results)
|
||||
});
|
||||
|
||||
let mut full_documents = Vec::with_capacity(documents.len());
|
||||
let number_of_documents = documents.len();
|
||||
for doc in documents {
|
||||
match view.document_by_id::<Document>(doc.id) {
|
||||
Ok(document) => {
|
||||
for name in &opt.displayed_fields {
|
||||
let attr = match schema.attribute(name) {
|
||||
Some(attr) => attr,
|
||||
None => continue,
|
||||
};
|
||||
let text = match document.get(name) {
|
||||
Some(text) => text,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
for document in documents {
|
||||
match view.retrieve_document::<Document>(document.id) {
|
||||
Ok(document) => full_documents.push(document),
|
||||
print!("{}: ", name);
|
||||
let areas = create_highlight_areas(&text, &doc.matches, attr);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
}
|
||||
},
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
|
||||
let mut matching_attributes = HashSet::new();
|
||||
for _match in doc.matches {
|
||||
let attr = SchemaAttr::new(_match.attribute.attribute());
|
||||
let name = schema.attribute_name(attr);
|
||||
matching_attributes.insert(name);
|
||||
}
|
||||
|
||||
let matching_attributes = Vec::from_iter(matching_attributes);
|
||||
println!("matching in: {:?}", matching_attributes);
|
||||
|
||||
println!();
|
||||
}
|
||||
|
||||
println!("{:#?}", full_documents);
|
||||
println!("Found {} results in {}", full_documents.len(), elapsed);
|
||||
|
||||
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
|
19
examples/schema-example.toml
Normal file
19
examples/schema-example.toml
Normal file
@ -0,0 +1,19 @@
|
||||
# This schema has been generated ...
|
||||
# The order in which the attributes are declared is important,
|
||||
# it specifies the attribute xxx...
|
||||
|
||||
identifier = "id"
|
||||
|
||||
[attributes.id]
|
||||
stored = true
|
||||
|
||||
[attributes.title]
|
||||
stored = true
|
||||
indexed = true
|
||||
|
||||
[attributes.description]
|
||||
stored = true
|
||||
indexed = true
|
||||
|
||||
[attributes.image]
|
||||
stored = true
|
@ -95,7 +95,8 @@ or
|
||||
other
|
||||
ought
|
||||
our
|
||||
ours ourselves
|
||||
ours
|
||||
ourselves
|
||||
out
|
||||
over
|
||||
own
|
||||
|
163
misc/fr.stopwords.txt
Normal file
163
misc/fr.stopwords.txt
Normal file
@ -0,0 +1,163 @@
|
||||
au
|
||||
aux
|
||||
avec
|
||||
ce
|
||||
ces
|
||||
dans
|
||||
de
|
||||
des
|
||||
du
|
||||
elle
|
||||
en
|
||||
et
|
||||
eux
|
||||
il
|
||||
je
|
||||
la
|
||||
le
|
||||
leur
|
||||
lui
|
||||
ma
|
||||
mais
|
||||
me
|
||||
même
|
||||
mes
|
||||
moi
|
||||
mon
|
||||
ne
|
||||
nos
|
||||
notre
|
||||
nous
|
||||
on
|
||||
ou
|
||||
par
|
||||
pas
|
||||
pour
|
||||
qu
|
||||
que
|
||||
qui
|
||||
sa
|
||||
se
|
||||
ses
|
||||
son
|
||||
sur
|
||||
ta
|
||||
te
|
||||
tes
|
||||
toi
|
||||
ton
|
||||
tu
|
||||
un
|
||||
une
|
||||
vos
|
||||
votre
|
||||
vous
|
||||
c
|
||||
d
|
||||
j
|
||||
l
|
||||
à
|
||||
m
|
||||
n
|
||||
s
|
||||
t
|
||||
y
|
||||
été
|
||||
étée
|
||||
étées
|
||||
étés
|
||||
étant
|
||||
suis
|
||||
es
|
||||
est
|
||||
sommes
|
||||
êtes
|
||||
sont
|
||||
serai
|
||||
seras
|
||||
sera
|
||||
serons
|
||||
serez
|
||||
seront
|
||||
serais
|
||||
serait
|
||||
serions
|
||||
seriez
|
||||
seraient
|
||||
étais
|
||||
était
|
||||
étions
|
||||
étiez
|
||||
étaient
|
||||
fus
|
||||
fut
|
||||
fûmes
|
||||
fûtes
|
||||
furent
|
||||
sois
|
||||
soit
|
||||
soyons
|
||||
soyez
|
||||
soient
|
||||
fusse
|
||||
fusses
|
||||
fût
|
||||
fussions
|
||||
fussiez
|
||||
fussent
|
||||
ayant
|
||||
eu
|
||||
eue
|
||||
eues
|
||||
eus
|
||||
ai
|
||||
as
|
||||
avons
|
||||
avez
|
||||
ont
|
||||
aurai
|
||||
auras
|
||||
aura
|
||||
aurons
|
||||
aurez
|
||||
auront
|
||||
aurais
|
||||
aurait
|
||||
aurions
|
||||
auriez
|
||||
auraient
|
||||
avais
|
||||
avait
|
||||
avions
|
||||
aviez
|
||||
avaient
|
||||
eut
|
||||
eûmes
|
||||
eûtes
|
||||
eurent
|
||||
aie
|
||||
aies
|
||||
ait
|
||||
ayons
|
||||
ayez
|
||||
aient
|
||||
eusse
|
||||
eusses
|
||||
eût
|
||||
eussions
|
||||
eussiez
|
||||
eussent
|
||||
ceci
|
||||
celà
|
||||
cet
|
||||
cette
|
||||
ici
|
||||
ils
|
||||
les
|
||||
leurs
|
||||
quel
|
||||
quels
|
||||
quelle
|
||||
quelles
|
||||
sans
|
||||
soi
|
105
src/attribute.rs
Normal file
105
src/attribute.rs
Normal file
@ -0,0 +1,105 @@
|
||||
use std::fmt;
|
||||
|
||||
/// Represent an attribute number along with the word index
/// according to the tokenizer used.
///
/// Both values are packed into a single `u32`: the attribute number is
/// stored in the 10 most significant bits and the word index in the
/// 22 least significant ones.
///
/// It can therefore accept attribute numbers in `0..1024` and word
/// indexes in `0..2^22`.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);

impl Attribute {
    /// Number of low bits used to store the word index.
    const WORD_INDEX_BITS: u32 = 22;

    /// Mask selecting the bits of the word index.
    const WORD_INDEX_MASK: u32 = (1 << Self::WORD_INDEX_BITS) - 1;

    /// Mask selecting the bits an attribute number is allowed to use.
    const ATTRIBUTE_MASK: u16 = (1 << 10) - 1;

    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Errors
    ///
    /// Returns `AttributeError::AttributeTooBig` when `attribute` does not
    /// fit in 10 bits and `AttributeError::IndexTooBig` when `index` does
    /// not fit in 22 bits.
    pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
        if attribute & !Self::ATTRIBUTE_MASK != 0 {
            return Err(AttributeError::AttributeTooBig)
        }

        // Every bit above the 22 reserved ones must be clear, otherwise the
        // index would overwrite the attribute number once or-ed below.
        if index & !Self::WORD_INDEX_MASK != 0 {
            return Err(AttributeError::IndexTooBig)
        }

        let attribute = u32::from(attribute) << Self::WORD_INDEX_BITS;
        Ok(Attribute(attribute | index))
    }

    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Panics
    ///
    /// The attribute must be lower than 1024
    /// and the word index lower than 2^22.
    pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
        match Attribute::new(attribute, index) {
            Ok(attribute) => attribute,
            Err(AttributeError::AttributeTooBig) => {
                panic!("attribute must not be greater than 1024")
            },
            Err(AttributeError::IndexTooBig) => {
                panic!("attribute word index must not be greater than 2^22")
            },
        }
    }

    /// The biggest `Attribute` according to its ordering (all bits set).
    pub(crate) fn max_value() -> Attribute {
        Attribute(u32::max_value())
    }

    /// The attribute number, stored in the 10 most significant bits.
    #[inline]
    pub fn attribute(self) -> u16 {
        (self.0 >> Self::WORD_INDEX_BITS) as u16
    }

    /// The word index, stored in the 22 least significant bits.
    #[inline]
    pub fn word_index(self) -> u32 {
        self.0 & Self::WORD_INDEX_MASK
    }
}

impl fmt::Debug for Attribute {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Attribute")
            .field("attribute", &self.attribute())
            .field("word_index", &self.word_index())
            .finish()
    }
}

/// The reasons why an `Attribute` cannot be constructed.
#[derive(Debug)]
pub enum AttributeError {
    /// The attribute number does not fit in 10 bits.
    AttributeTooBig,
    /// The word index does not fit in 22 bits.
    IndexTooBig,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};

    quickcheck! {
        /// An `Attribute` gives back the values it was constructed from.
        fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
            // 2^10 and 2^22 are already out of range, `new_faillible`
            // would panic on them instead of failing the property.
            if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
                return TestResult::discard()
            }

            let attribute = Attribute::new_faillible(gen_attr, gen_index);

            let valid_attribute = attribute.attribute() == gen_attr;
            let valid_index = attribute.word_index() == gen_index;

            TestResult::from_bool(valid_attribute && valid_index)
        }

        /// Incrementing both values gives a strictly greater `Attribute`.
        fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
            // Both values are incremented below, the incremented values
            // must stay in range too or `new_faillible` panics.
            if gen_attr >= 2_u16.pow(10) - 1 || gen_index >= 2_u32.pow(22) - 1 {
                return TestResult::discard()
            }

            let a = Attribute::new_faillible(gen_attr, gen_index);
            let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);

            TestResult::from_bool(a < b)
        }
    }
}
|
@ -50,6 +50,7 @@ impl AutomatonExt for DfaExt {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum PrefixSetting {
|
||||
Prefix,
|
||||
NoPrefix,
|
||||
|
@ -1,59 +1,54 @@
|
||||
use std::io::{self, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds {
|
||||
data: Data,
|
||||
}
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
let data = Data::Mmap(mmap);
|
||||
Ok(DocIds { data })
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
// FIXME check if modulo DocumentId
|
||||
let len = vec.len();
|
||||
let data = Data::Shared {
|
||||
bytes: Arc::new(vec),
|
||||
offset: 0,
|
||||
len: len
|
||||
};
|
||||
Ok(DocIds { data })
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let doc_ids = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIds(doc_ids))
|
||||
}
|
||||
|
||||
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn doc_ids(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.data;
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / mem::size_of::<DocumentId>();
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for DocIds {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.data.as_ref().serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
@ -1,16 +1,15 @@
|
||||
use std::io::{self, Write, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
@ -21,52 +20,45 @@ struct Range {
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
DocIndexes::from_data(Data::Mmap(mmap))
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
|
||||
let bytes = Arc::new(bytes);
|
||||
let len = bytes.len();
|
||||
let data = SharedData::new(bytes, 0, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
DocIndexes::from_cursor(&mut cursor)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let ranges = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = Data::Shared { bytes, offset, len };
|
||||
DocIndexes::from_data(data)
|
||||
}
|
||||
|
||||
fn from_data(data: Data) -> io::Result<Self> {
|
||||
let ranges_len_offset = data.len() - size_of::<u64>();
|
||||
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize;
|
||||
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = data.range(ranges_offset, ranges_len);
|
||||
|
||||
let indexes = data.range(0, ranges_offset);
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let indexes = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn to_vec(&self) -> Vec<u8> {
|
||||
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
|
||||
let mut bytes = Vec::with_capacity(capacity);
|
||||
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
|
||||
|
||||
bytes
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
self.ranges().get(index as usize).map(|Range { start, end }| {
|
||||
self.ranges().get(index).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
let slice = &self.indexes()[start..end];
|
||||
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder::new(Vec::new())
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
// write the values
|
||||
let indexes = unsafe { into_u8_slice(indexes) };
|
||||
self.wtr.write_all(indexes)
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
@ -135,40 +131,52 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
// write the ranges
|
||||
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reinterprets a slice of `T` as the bytes backing it.
///
/// The returned slice points to the same memory as `slice` and is
/// `slice.len() * size_of::<T>()` bytes long.
///
/// # Safety
///
/// The bytes of every `T` are exposed as they are laid out in memory. The
/// caller must make sure that reading them is valid for `T`, e.g. that `T`
/// contains no uninitialized padding bytes.
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let byte_len = size_of::<T>() * slice.len();
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::error::Error;
|
||||
use crate::{Attribute, WordArea};
|
||||
|
||||
use crate::DocumentId;
|
||||
|
||||
#[test]
|
||||
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: Attribute::new_faillible(3, 11),
|
||||
word_area: WordArea::new_faillible(30, 4)
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: Attribute::new_faillible(4, 21),
|
||||
word_area: WordArea::new_faillible(35, 6)
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: Attribute::new_faillible(8, 2),
|
||||
word_area: WordArea::new_faillible(89, 6)
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(bytes)?;
|
||||
@ -183,19 +191,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: Attribute::new_faillible(3, 11),
|
||||
word_area: WordArea::new_faillible(30, 4)
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: Attribute::new_faillible(4, 21),
|
||||
word_area: WordArea::new_faillible(35, 6)
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: Attribute::new_faillible(8, 2),
|
||||
word_area: WordArea::new_faillible(89, 6)
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let builder_bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
|
||||
let bytes = docs.to_vec();
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
docs.write_to_bytes(&mut bytes);
|
||||
|
||||
assert_eq!(builder_bytes, bytes);
|
||||
|
||||
|
@ -1,51 +1,43 @@
|
||||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Data {
|
||||
Shared {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
#[derive(Default, Clone)]
|
||||
pub struct SharedData {
|
||||
pub bytes: Arc<Vec<u8>>,
|
||||
pub offset: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl Data {
|
||||
pub fn range(&self, off: usize, l: usize) -> Data {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
assert!(off + l <= *len);
|
||||
Data::Shared {
|
||||
bytes: bytes.clone(),
|
||||
offset: offset + off,
|
||||
len: l,
|
||||
}
|
||||
},
|
||||
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
|
||||
impl SharedData {
|
||||
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
|
||||
let len = vec.len();
|
||||
let bytes = Arc::new(vec);
|
||||
SharedData::new(bytes, 0, len)
|
||||
}
|
||||
|
||||
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
|
||||
SharedData { bytes, offset, len }
|
||||
}
|
||||
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
assert!(offset + len <= self.len);
|
||||
SharedData {
|
||||
bytes: self.bytes.clone(),
|
||||
offset: self.offset + offset,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Data {
|
||||
fn default() -> Data {
|
||||
Data::Shared {
|
||||
bytes: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@ -53,13 +45,14 @@ impl Deref for Data {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for Data {
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
&bytes[*offset..offset + len]
|
||||
},
|
||||
Data::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
}
|
||||
|
||||
/// Reinterprets a slice of `T` as the bytes backing it.
///
/// The returned slice points to the same memory as `slice` and is
/// `slice.len() * size_of::<T>()` bytes long.
///
/// # Safety
///
/// The bytes of every `T` are exposed as they are laid out in memory. The
/// caller must make sure that reading them is valid for `T`, e.g. that `T`
/// contains no uninitialized padding bytes.
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
    let byte_len = size_of::<T>() * slice.len();
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
||||
|
@ -1,110 +0,0 @@
|
||||
mod ops;
|
||||
pub mod positive;
|
||||
pub mod negative;
|
||||
|
||||
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::negative::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Blob {
|
||||
Positive(PositiveBlob),
|
||||
Negative(NegativeBlob),
|
||||
}
|
||||
|
||||
impl Blob {
|
||||
pub fn is_negative(&self) -> bool {
|
||||
self.sign() == Sign::Negative
|
||||
}
|
||||
|
||||
pub fn is_positive(&self) -> bool {
|
||||
self.sign() == Sign::Positive
|
||||
}
|
||||
|
||||
pub fn sign(&self) -> Sign {
|
||||
match self {
|
||||
Blob::Positive(_) => Sign::Positive,
|
||||
Blob::Negative(_) => Sign::Negative,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Blob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
match self {
|
||||
Blob::Positive(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Positive)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
Blob::Negative(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Negative)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Blob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
|
||||
struct TupleVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for TupleVisitor {
|
||||
type Value = Blob;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a Blob struct")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
|
||||
let sign = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(0, &self)),
|
||||
};
|
||||
match sign {
|
||||
Sign::Positive => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Positive(blob))
|
||||
},
|
||||
Sign::Negative => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Negative(blob))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_tuple(2, TupleVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum Sign {
|
||||
Positive,
|
||||
Negative,
|
||||
}
|
||||
|
||||
impl Sign {
|
||||
pub fn invert(self) -> Sign {
|
||||
match self {
|
||||
Sign::Positive => Sign::Negative,
|
||||
Sign::Negative => Sign::Positive,
|
||||
}
|
||||
}
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::fmt;
|
||||
|
||||
use sdset::Set;
|
||||
use serde::de::{self, Deserialize, Deserializer};
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct NegativeBlob {
|
||||
doc_ids: DocIds,
|
||||
}
|
||||
|
||||
impl NegativeBlob {
|
||||
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
{
|
||||
let doc_ids = DocIds::from_path(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let doc_ids = DocIds::from_bytes(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_raw(doc_ids: DocIds) -> Self {
|
||||
NegativeBlob { doc_ids }
|
||||
}
|
||||
|
||||
pub fn as_ids(&self) -> &DocIds {
|
||||
&self.doc_ids
|
||||
}
|
||||
|
||||
pub fn into_doc_ids(self) -> DocIds {
|
||||
self.doc_ids
|
||||
}
|
||||
}
|
||||
|
||||
/// Gives access to the document ids of the blob as a set.
impl AsRef<Set<DocumentId>> for NegativeBlob {
    fn as_ref(&self) -> &Set<DocumentId> {
        self.doc_ids.doc_ids()
    }
}
|
||||
|
||||
impl fmt::Debug for NegativeBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "NegativeBlob(")?;
|
||||
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for NegativeBlob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.doc_ids.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for NegativeBlob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
|
||||
let bytes = Vec::deserialize(deserializer)?;
|
||||
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
|
||||
}
|
||||
}
|
@ -1,5 +0,0 @@
|
||||
mod blob;
|
||||
mod ops;
|
||||
|
||||
pub use self::blob::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
@ -1,73 +0,0 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::NegativeBlob;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct OpBuilder<'a> {
|
||||
inner: SdOpBuilder<'a, DocumentId>,
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple negative blobs.
|
||||
impl<'a> OpBuilder<'a> {
|
||||
pub fn new() -> Self {
|
||||
Self { inner: SdOpBuilder::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> Self {
|
||||
Self { inner: SdOpBuilder::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'a NegativeBlob) {
|
||||
let set = Set::new_unchecked(blob.as_ref());
|
||||
self.inner.push(set);
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'a> {
|
||||
Union::new(self.inner.union())
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'a> {
|
||||
Intersection::new(self.inner.intersection())
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'a> {
|
||||
Difference::new(self.inner.difference())
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
|
||||
SymmetricDifference::new(self.inner.symmetric_difference())
|
||||
}
|
||||
}
|
||||
|
||||
// Declares a public struct `$name` wrapping the `sdset::multi` operation of
// the same name, along with a way to turn its result into a `NegativeBlob`.
//
// NOTE(review): `$operation` is accepted but never used by the expansion.
macro_rules! logical_operation {
    (struct $name:ident, $operation:ident) => {

pub struct $name<'a> {
    op: sdset::multi::$name<'a, DocumentId>,
}

impl<'a> $name<'a> {
    fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
        Self { op }
    }

    /// Runs the set operation and stores the resulting document ids
    /// in a new `NegativeBlob`.
    pub fn into_negative_blob(self) -> NegativeBlob {
        let ids = sdset::SetOperation::into_set_buf(self.op).into_vec();
        NegativeBlob::from_raw(DocIds::from_document_ids(ids))
    }
}

}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
@ -1,109 +0,0 @@
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use group_by::GroupBy;
|
||||
|
||||
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
|
||||
use crate::database::blob::{positive, negative};
|
||||
|
||||
/// Tells whether the two blobs are of the same sign.
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
    let (left, right) = (a.sign(), b.sign());
    left == right
}
|
||||
|
||||
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
|
||||
match blob {
|
||||
Blob::Positive(blob) => blob,
|
||||
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
|
||||
}
|
||||
}
|
||||
|
||||
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
|
||||
match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OpBuilder {
|
||||
blobs: Vec<Blob>,
|
||||
}
|
||||
|
||||
impl OpBuilder {
|
||||
pub fn new() -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: Blob) {
|
||||
if self.blobs.is_empty() && blob.is_negative() { return }
|
||||
self.blobs.push(blob);
|
||||
}
|
||||
|
||||
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
|
||||
let groups = GroupBy::new(&self.blobs, blob_same_sign);
|
||||
let mut aggregated = Vec::new();
|
||||
|
||||
for blobs in groups {
|
||||
match blobs[0].sign() {
|
||||
Sign::Positive => {
|
||||
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_positive(blob));
|
||||
}
|
||||
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
// FIXME empty doc_indexes must be handled by OpBuilder
|
||||
if !doc_indexes.is_empty() {
|
||||
builder.insert(input, doc_indexes).unwrap();
|
||||
}
|
||||
}
|
||||
let (map, doc_indexes) = builder.into_inner().unwrap();
|
||||
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
|
||||
aggregated.push(Blob::Positive(blob));
|
||||
},
|
||||
Sign::Negative => {
|
||||
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_negative(blob));
|
||||
}
|
||||
let blob = op_builder.union().into_negative_blob();
|
||||
aggregated.push(Blob::Negative(blob));
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
|
||||
let negative = NegativeBlob::default();
|
||||
let (positive, negative) = match slice {
|
||||
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
|
||||
[a] => (unwrap_positive(a), &negative),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
|
||||
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
|
||||
|
||||
buffer.clear();
|
||||
op.extend_vec(&mut buffer);
|
||||
if !buffer.is_empty() {
|
||||
builder.insert(input, Set::new_unchecked(&buffer))?;
|
||||
}
|
||||
}
|
||||
|
||||
let (map, doc_indexes) = builder.into_inner()?;
|
||||
PositiveBlob::from_bytes(map, doc_indexes)
|
||||
})
|
||||
}
|
||||
}
|
@ -1,254 +0,0 @@
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct PositiveBlob {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl PositiveBlob {
|
||||
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let map = Map::from_path(map)?;
|
||||
let indexes = DocIndexes::from_path(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
|
||||
PositiveBlob { map, indexes }
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
|
||||
self.map.get(key).map(|index| &self.indexes[index as usize])
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn explode(self) -> (Map, DocIndexes) {
|
||||
(self.map, self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PositiveBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "PositiveBlob([")?;
|
||||
let mut stream = self.into_stream();
|
||||
let mut first = true;
|
||||
while let Some((k, v)) = stream.next() {
|
||||
if !first {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
|
||||
}
|
||||
write!(f, "])")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
/// The type of the stream to be constructed.
|
||||
type Into = PositiveBlobStream<'m>;
|
||||
|
||||
/// Construct a stream from `Self`.
|
||||
fn into_stream(self) -> Self::Into {
|
||||
PositiveBlobStream {
|
||||
map_stream: self.map.into_stream(),
|
||||
doc_indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobStream<'m> {
|
||||
map_stream: map::Stream<'m>,
|
||||
doc_indexes: &'m DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.map_stream.next() {
|
||||
Some((input, index)) => {
|
||||
let doc_indexes = &self.doc_indexes[index as usize];
|
||||
Some((input, doc_indexes))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for PositiveBlob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&self.map.as_fst().to_vec())?;
|
||||
tuple.serialize_element(&self.indexes.to_vec())?;
|
||||
tuple.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for PositiveBlob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
|
||||
struct TupleVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for TupleVisitor {
|
||||
type Value = PositiveBlob;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a PositiveBlob struct")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
|
||||
let map = match seq.next_element()? {
|
||||
Some(bytes) => match Map::from_bytes(bytes) {
|
||||
Ok(value) => value,
|
||||
Err(err) => return Err(de::Error::custom(err)),
|
||||
},
|
||||
None => return Err(de::Error::invalid_length(0, &self)),
|
||||
};
|
||||
|
||||
let indexes = match seq.next_element()? {
|
||||
Some(bytes) => match DocIndexes::from_bytes(bytes) {
|
||||
Ok(value) => value,
|
||||
Err(err) => return Err(de::Error::custom(err)),
|
||||
},
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_tuple(2, TupleVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobBuilder<W, X> {
|
||||
map: fst::MapBuilder<W>,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
PositiveBlobBuilder {
|
||||
map: fst::MapBuilder::memory(),
|
||||
indexes: DocIndexesBuilder::memory(),
|
||||
value: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
|
||||
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
|
||||
Ok(PositiveBlobBuilder {
|
||||
map: fst::MapBuilder::new(map)?,
|
||||
indexes: DocIndexesBuilder::new(indexes),
|
||||
value: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// If a key is inserted that is less than or equal to any previous key added,
|
||||
/// then an error is returned. Similarly, if there was a problem writing
|
||||
/// to the underlying writer, an error is returned.
|
||||
// FIXME what if one write doesn't work but the other do ?
|
||||
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(doc_indexes)?;
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
let map = self.map.into_inner()?;
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    /// The three doc indexes used as fixtures by every test.
    fn fixtures() -> [DocIndex; 3] {
        [
            DocIndex { document_id: 0, attribute: 3, attribute_index: 11 },
            DocIndex { document_id: 1, attribute: 4, attribute_index: 21 },
            DocIndex { document_id: 2, attribute: 8, attribute_index: 2 },
        ]
    }

    /// Builds a blob holding the keys "aaa", "aab" and "aac".
    fn build_blob(a: DocIndex, b: DocIndex, c: DocIndex) -> Result<PositiveBlob, Box<Error>> {
        let mut builder = PositiveBlobBuilder::memory();

        builder.insert("aaa", Set::new(&[a])?)?;
        builder.insert("aab", Set::new(&[a, b, c])?)?;
        builder.insert("aac", Set::new(&[a, c])?)?;

        let (map_bytes, indexes_bytes) = builder.into_inner()?;
        PositiveBlob::from_bytes(map_bytes, indexes_bytes)
    }

    /// Checks the content expected from a blob made by `build_blob`.
    fn assert_content(blob: &PositiveBlob, a: DocIndex, b: DocIndex, c: DocIndex) {
        assert_eq!(blob.get("aaa"), Some(&[a][..]));
        assert_eq!(blob.get("aab"), Some(&[a, b, c][..]));
        assert_eq!(blob.get("aac"), Some(&[a, c][..]));
        assert_eq!(blob.get("aad"), None);
    }

    #[test]
    fn serialize_deserialize() -> Result<(), Box<Error>> {
        let [a, b, c] = fixtures();

        let positive_blob = build_blob(a, b, c)?;
        assert_content(&positive_blob, a, b, c);

        Ok(())
    }

    #[test]
    fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
        let [a, b, c] = fixtures();

        let positive_blob = build_blob(a, b, c)?;

        // Round trip through bincode before checking the content.
        let bytes = bincode::serialize(&positive_blob)?;
        let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;
        assert_content(&positive_blob, a, b, c);

        Ok(())
    }
}
|
@ -1,5 +0,0 @@
|
||||
mod blob;
|
||||
mod ops;
|
||||
|
||||
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::ops::OpBuilder;
|
@ -1,128 +0,0 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::{SetOperation, Set};
|
||||
|
||||
use crate::database::blob::PositiveBlob;
|
||||
use crate::data::DocIndexes;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct OpBuilder<'m> {
|
||||
// the operation on the maps is always an union.
|
||||
map_op: fst::map::OpBuilder<'m>,
|
||||
indexes: Vec<&'m DocIndexes>,
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple positive blobs.
|
||||
impl<'m> OpBuilder<'m> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
map_op: fst::map::OpBuilder::new(),
|
||||
indexes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> Self {
|
||||
Self {
|
||||
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
|
||||
indexes: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'m PositiveBlob) {
|
||||
self.map_op.push(blob.as_map());
|
||||
self.indexes.push(blob.as_indexes());
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'m> {
|
||||
Union::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'m> {
|
||||
Intersection::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'m> {
|
||||
Difference::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
|
||||
SymmetricDifference::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! logical_operation {
|
||||
(struct $name:ident, $operation:ident) => {
|
||||
|
||||
pub struct $name<'m> {
|
||||
stream: fst::map::Union<'m>,
|
||||
indexes: Vec<&'m DocIndexes>,
|
||||
outs: Vec<DocIndex>,
|
||||
}
|
||||
|
||||
impl<'m> $name<'m> {
|
||||
fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
|
||||
$name {
|
||||
stream: stream,
|
||||
indexes: indexes,
|
||||
outs: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
// loop {
|
||||
// let (input, ivalues) = match self.stream.next() {
|
||||
// Some(value) => value,
|
||||
// None => return None,
|
||||
// };
|
||||
|
||||
// self.outs.clear();
|
||||
|
||||
// let mut builder = SdOpBuilder::with_capacity(ivalues.len());
|
||||
// for ivalue in ivalues {
|
||||
// let indexes = self.indexes[ivalue.index];
|
||||
// let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
|
||||
// let set = Set::new_unchecked(indexes);
|
||||
// builder.push(set);
|
||||
// }
|
||||
|
||||
// builder.$operation().extend_vec(&mut self.outs);
|
||||
|
||||
// if self.outs.is_empty() { continue }
|
||||
// return Some((input, &self.outs))
|
||||
// }
|
||||
|
||||
// FIXME make the above code compile
|
||||
match self.stream.next() {
|
||||
Some((input, ivalues)) => {
|
||||
self.outs.clear();
|
||||
|
||||
let mut builder = SdOpBuilder::with_capacity(ivalues.len());
|
||||
for ivalue in ivalues {
|
||||
let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
|
||||
let set = Set::new_unchecked(doc_indexes);
|
||||
builder.push(set);
|
||||
}
|
||||
|
||||
builder.$operation().extend_vec(&mut self.outs);
|
||||
|
||||
if self.outs.is_empty() { return None }
|
||||
return Some((input, Set::new_unchecked(&self.outs)))
|
||||
},
|
||||
None => None
|
||||
}
|
||||
}
|
||||
}
|
||||
}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
@ -2,13 +2,13 @@ use std::io::{Cursor, Read, Write};
|
||||
use std::mem::size_of;
|
||||
use std::fmt;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
|
||||
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
|
||||
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::DocumentId;
|
||||
|
||||
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
|
||||
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
|
||||
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct DocumentKey([u8; DOC_KEY_LEN]);
|
||||
@ -19,7 +19,7 @@ impl DocumentKey {
|
||||
|
||||
let mut wtr = Cursor::new(&mut buffer[..]);
|
||||
wtr.write_all(b"doc-").unwrap();
|
||||
wtr.write_u64::<NativeEndian>(id).unwrap();
|
||||
wtr.write_u64::<BigEndian>(id.0).unwrap();
|
||||
|
||||
DocumentKey(buffer)
|
||||
}
|
||||
@ -43,7 +43,8 @@ impl DocumentKey {
|
||||
}
|
||||
|
||||
pub fn document_id(&self) -> DocumentId {
|
||||
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
|
||||
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
|
||||
DocumentId(id)
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,11 +73,19 @@ impl DocumentKeyAttr {
|
||||
let mut wtr = Cursor::new(&mut buffer[..]);
|
||||
wtr.write_all(&raw_key).unwrap();
|
||||
wtr.write_all(b"-").unwrap();
|
||||
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
|
||||
wtr.write_u16::<BigEndian>(attr.0).unwrap();
|
||||
|
||||
DocumentKeyAttr(buffer)
|
||||
}
|
||||
|
||||
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
|
||||
DocumentKeyAttr::new(id, SchemaAttr::min())
|
||||
}
|
||||
|
||||
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
|
||||
DocumentKeyAttr::new(id, SchemaAttr::max())
|
||||
}
|
||||
|
||||
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
|
||||
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
|
||||
assert_eq!(&bytes[..4], b"doc-");
|
||||
@ -88,12 +97,13 @@ impl DocumentKeyAttr {
|
||||
}
|
||||
|
||||
pub fn document_id(&self) -> DocumentId {
|
||||
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
|
||||
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
|
||||
DocumentId(id)
|
||||
}
|
||||
|
||||
pub fn attribute(&self) -> SchemaAttr {
|
||||
let offset = 4 + size_of::<u64>() + 1;
|
||||
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
|
||||
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
|
||||
SchemaAttr::new(value)
|
||||
}
|
||||
|
||||
@ -112,7 +122,24 @@ impl fmt::Debug for DocumentKeyAttr {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("DocumentKeyAttr")
|
||||
.field("document_id", &self.document_id())
|
||||
.field("attribute", &self.attribute().as_u32())
|
||||
.field("attribute", &self.attribute().0)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Keys of the same document built with consecutive attributes must
    /// compare in attribute order, both as values and as raw bytes.
    #[test]
    fn keep_as_ref_order() {
        let id = DocumentId(0);

        for low in 0..u16::max_value() - 1 {
            let high = low + 1;
            let a = DocumentKeyAttr::new(id, SchemaAttr(low));
            let b = DocumentKeyAttr::new(id, SchemaAttr(high));

            assert!(a < b);
            assert!(a.as_ref() < b.as_ref());
        }
    }
}
|
||||
|
82
src/database/index/mod.rs
Normal file
82
src/database/index/mod.rs
Normal file
@ -0,0 +1,82 @@
|
||||
mod negative;
|
||||
mod positive;
|
||||
|
||||
pub(crate) use self::negative::Negative;
|
||||
pub(crate) use self::positive::{Positive, PositiveBuilder};
|
||||
|
||||
use std::error::Error;
|
||||
use std::io::Cursor;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use fst::Map;
|
||||
|
||||
use crate::data::{SharedData, DocIndexes};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Index {
|
||||
pub(crate) negative: Negative,
|
||||
pub(crate) positive: Positive,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
|
||||
let len = bytes.len();
|
||||
Index::from_shared_bytes(Arc::new(bytes), 0, len)
|
||||
}
|
||||
|
||||
pub fn from_shared_bytes(
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
) -> Result<Index, Box<Error>>
|
||||
{
|
||||
let data = SharedData::new(bytes, offset, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
|
||||
let negative = Negative::from_cursor(&mut cursor)?;
|
||||
let positive = Positive::from_cursor(&mut cursor)?;
|
||||
Ok(Index { negative, positive })
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
self.negative.write_to_bytes(bytes);
|
||||
self.positive.write_to_bytes(bytes);
|
||||
}
|
||||
|
||||
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
|
||||
if other.negative.is_empty() {
|
||||
let negative = Negative::default();
|
||||
let positive = self.positive.union(&other.positive)?;
|
||||
return Ok(Index { negative, positive })
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
let mut stream = self.positive.into_stream();
|
||||
while let Some((key, indexes)) = stream.next() {
|
||||
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
|
||||
|
||||
buffer.clear();
|
||||
op.extend_vec(&mut buffer);
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
}
|
||||
|
||||
let positive = {
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
let negative = Negative::default();
|
||||
let positive = positive.union(&other.positive)?;
|
||||
Ok(Index { negative, positive })
|
||||
}
|
||||
}
|
43
src/database/index/negative.rs
Normal file
43
src/database/index/negative.rs
Normal file
@ -0,0 +1,43 @@
|
||||
use std::error::Error;
|
||||
use std::io::Cursor;
|
||||
use std::ops::Deref;
|
||||
|
||||
use sdset::Set;
|
||||
use byteorder::{LittleEndian, WriteBytesExt};
|
||||
|
||||
use crate::data::SharedData;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Negative(DocIds);
|
||||
|
||||
impl Negative {
|
||||
pub fn new(doc_ids: DocIds) -> Negative {
|
||||
Negative(doc_ids)
|
||||
}
|
||||
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
|
||||
let doc_ids = DocIds::from_cursor(cursor)?;
|
||||
Ok(Negative(doc_ids))
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.0.as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Negative {
|
||||
type Target = Set<DocumentId>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
166
src/database/index/positive.rs
Normal file
166
src/database/index/positive.rs
Normal file
@ -0,0 +1,166 @@
|
||||
use std::io::{Write, BufRead, Cursor};
|
||||
use std::error::Error;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
use sdset::{Set, SetOperation};
|
||||
use sdset::duo::Union;
|
||||
use fst::raw::Fst;
|
||||
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use crate::data::SharedData;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Positive {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl Positive {
|
||||
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
|
||||
Positive { map, indexes }
|
||||
}
|
||||
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let data = cursor.get_ref().range(offset, len);
|
||||
|
||||
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
|
||||
let map = Map::from(fst);
|
||||
cursor.consume(len);
|
||||
|
||||
let indexes = DocIndexes::from_cursor(cursor)?;
|
||||
|
||||
Ok(Positive { map, indexes})
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.map.as_fst().as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
|
||||
self.indexes.write_to_bytes(bytes);
|
||||
}
|
||||
|
||||
pub fn map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
while let Some((key, ivalues)) = stream.next() {
|
||||
buffer.clear();
|
||||
match ivalues {
|
||||
[a, b] => {
|
||||
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
|
||||
let a = Set::new_unchecked(indexes);
|
||||
|
||||
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
|
||||
let b = Set::new_unchecked(indexes);
|
||||
|
||||
let op = Union::new(a, b);
|
||||
op.extend_vec(&mut buffer);
|
||||
},
|
||||
[a] => {
|
||||
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
|
||||
buffer.extend_from_slice(indexes)
|
||||
},
|
||||
_ => continue,
|
||||
}
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
}
|
||||
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(Positive { map, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
/// The type of the stream to be constructed.
|
||||
type Into = Stream<'m>;
|
||||
|
||||
/// Construct a stream from `Self`.
|
||||
fn into_stream(self) -> Self::Into {
|
||||
Stream {
|
||||
map_stream: self.map.into_stream(),
|
||||
indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Stream<'m> {
|
||||
map_stream: map::Stream<'m>,
|
||||
indexes: &'m DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'a> Streamer<'a> for Stream<'m> {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.map_stream.next() {
|
||||
Some((input, index)) => {
|
||||
let indexes = &self.indexes[index as usize];
|
||||
let indexes = Set::new_unchecked(indexes);
|
||||
Some((input, indexes))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBuilder<W, X> {
|
||||
map: fst::MapBuilder<W>,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
PositiveBuilder {
|
||||
map: fst::MapBuilder::memory(),
|
||||
indexes: DocIndexesBuilder::memory(),
|
||||
value: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> PositiveBuilder<W, X> {
|
||||
/// If a key is inserted that is less than or equal to any previous key added,
|
||||
/// then an error is returned. Similarly, if there was a problem writing
|
||||
/// to the underlying writer, an error is returned.
|
||||
// FIXME what if one write doesn't work but the other do ?
|
||||
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(indexes);
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
let map = self.map.into_inner()?;
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
@ -1,45 +1,81 @@
|
||||
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
|
||||
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
||||
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
use rocksdb::{DB, DBVector, MergeOperands};
|
||||
use crossbeam::atomic::ArcCell;
|
||||
use log::info;
|
||||
|
||||
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use self::database_view::{DatabaseView, DocumentIter};
|
||||
use self::blob::positive::PositiveBlob;
|
||||
use self::update::Update;
|
||||
use self::schema::Schema;
|
||||
use self::blob::Blob;
|
||||
|
||||
pub mod blob;
|
||||
pub mod schema;
|
||||
pub mod update;
|
||||
mod document_key;
|
||||
mod database_view;
|
||||
mod deserializer;
|
||||
pub use self::view::{DatabaseView, DocumentIter};
|
||||
pub use self::update::{Update, UpdateBuilder};
|
||||
pub use self::serde::SerializerError;
|
||||
pub use self::schema::Schema;
|
||||
pub use self::index::Index;
|
||||
|
||||
const DATA_INDEX: &[u8] = b"data-index";
|
||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||
|
||||
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
pub mod schema;
|
||||
pub(crate) mod index;
|
||||
mod deserializer;
|
||||
mod document_key;
|
||||
mod serde;
|
||||
mod update;
|
||||
mod view;
|
||||
|
||||
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_SCHEMA)? {
|
||||
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
||||
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
|
||||
None => Err(String::from("BUG: no schema found in the database").into()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
|
||||
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_INDEX)? {
|
||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
let (elapsed, vector) = elapsed::measure_time(|| snapshot.get(DATA_INDEX));
|
||||
info!("loading index from kv-store took {}", elapsed);
|
||||
|
||||
let index = match vector? {
|
||||
Some(vector) => {
|
||||
let bytes = vector.as_ref().to_vec();
|
||||
info!("index size if {} MiB", bytes.len() / 1024 / 1024);
|
||||
|
||||
let (elapsed, index) = elapsed::measure_time(|| Index::from_bytes(bytes));
|
||||
info!("loading index from bytes took {}", elapsed);
|
||||
index?
|
||||
|
||||
},
|
||||
None => Index::default(),
|
||||
};
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
|
||||
|
||||
let mut index: Option<Index> = None;
|
||||
for bytes in existing.into_iter().chain(operands) {
|
||||
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
|
||||
let merged = match index {
|
||||
Some(ref index) => index.merge(&operand).unwrap(),
|
||||
None => operand,
|
||||
};
|
||||
|
||||
index.replace(merged);
|
||||
}
|
||||
|
||||
let index = index.unwrap_or_default();
|
||||
let mut bytes = Vec::new();
|
||||
index.write_to_bytes(&mut bytes);
|
||||
bytes
|
||||
}
|
||||
|
||||
pub struct Database {
|
||||
@ -49,11 +85,11 @@ pub struct Database {
|
||||
db: Mutex<Arc<DB>>,
|
||||
|
||||
// This view is updated each time the DB ingests an update
|
||||
view: RwLock<DatabaseView<Arc<DB>>>,
|
||||
view: ArcCell<DatabaseView<Arc<DB>>>,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> {
|
||||
pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
|
||||
let path = path.as_ref();
|
||||
if path.exists() {
|
||||
return Err(format!("File already exists at path: {}, cannot create database.",
|
||||
@ -71,12 +107,12 @@ impl Database {
|
||||
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
|
||||
|
||||
let mut schema_bytes = Vec::new();
|
||||
schema.write_to(&mut schema_bytes)?;
|
||||
schema.write_to_bin(&mut schema_bytes)?;
|
||||
db.put(DATA_SCHEMA, &schema_bytes)?;
|
||||
|
||||
let db = Arc::new(db);
|
||||
let snapshot = Snapshot::new(db.clone());
|
||||
let view = RwLock::new(DatabaseView::new(snapshot)?);
|
||||
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
|
||||
|
||||
Ok(Database { db: Mutex::new(db), view })
|
||||
}
|
||||
@ -94,18 +130,18 @@ impl Database {
|
||||
|
||||
// FIXME create a generic function to do that !
|
||||
let _schema = match db.get(DATA_SCHEMA)? {
|
||||
Some(value) => Schema::read_from(&*value)?,
|
||||
Some(value) => Schema::read_from_bin(&*value)?,
|
||||
None => return Err(String::from("Database does not contain a schema").into()),
|
||||
};
|
||||
|
||||
let db = Arc::new(db);
|
||||
let snapshot = Snapshot::new(db.clone());
|
||||
let view = RwLock::new(DatabaseView::new(snapshot)?);
|
||||
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
|
||||
|
||||
Ok(Database { db: Mutex::new(db), view })
|
||||
}
|
||||
|
||||
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> {
|
||||
pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let snapshot = {
|
||||
// We must have a mutex here to ensure that update ingestions and compactions
|
||||
// are done atomically and in the right order.
|
||||
@ -116,32 +152,24 @@ impl Database {
|
||||
Err(e) => return Err(e.to_string().into()),
|
||||
};
|
||||
|
||||
let move_update = update.can_be_moved();
|
||||
let path = update.into_path_buf();
|
||||
let path = path.to_string_lossy();
|
||||
let path = update.path().to_string_lossy();
|
||||
let options = IngestExternalFileOptions::new();
|
||||
// options.move_files(move_update);
|
||||
|
||||
let mut options = IngestExternalFileOptions::new();
|
||||
options.move_files(move_update);
|
||||
|
||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
||||
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
|
||||
|
||||
// Compacting to trigger the merge operator only one time
|
||||
// while ingesting the update and not each time searching
|
||||
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
|
||||
let (elapsed, result) = elapsed::measure_time(|| {
|
||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
||||
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])
|
||||
});
|
||||
let _ = result?;
|
||||
info!("ingesting update file took {}", elapsed);
|
||||
|
||||
Snapshot::new(db.clone())
|
||||
};
|
||||
|
||||
// Here we will block the view creation for the minimum amount of time:
|
||||
// updating the DatabaseView itself with the new database snapshot
|
||||
let view = DatabaseView::new(snapshot)?;
|
||||
match self.view.write() {
|
||||
Ok(mut lock) => *lock = view,
|
||||
Err(e) => return Err(e.to_string().into()),
|
||||
}
|
||||
let view = Arc::new(DatabaseView::new(snapshot)?);
|
||||
self.view.set(view.clone());
|
||||
|
||||
Ok(())
|
||||
Ok(view)
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
|
||||
@ -155,105 +183,508 @@ impl Database {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> {
|
||||
self.view.read().unwrap()
|
||||
pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
|
||||
self.view.get()
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
if key != DATA_INDEX {
|
||||
panic!("The merge operator only supports \"data-index\" merging")
|
||||
}
|
||||
|
||||
let capacity = {
|
||||
let remaining = operands.size_hint().0;
|
||||
let already_exist = usize::from(existing_value.is_some());
|
||||
remaining + already_exist
|
||||
};
|
||||
|
||||
let mut op = blob::OpBuilder::with_capacity(capacity);
|
||||
if let Some(existing_value) = existing_value {
|
||||
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
|
||||
op.push(Blob::Positive(blob));
|
||||
}
|
||||
|
||||
for bytes in operands {
|
||||
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
|
||||
op.push(blob);
|
||||
}
|
||||
|
||||
let blob = op.merge().expect("BUG: could not merge blobs");
|
||||
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use hashbrown::HashSet;
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
use crate::database::update::PositiveUpdateBuilder;
|
||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||
use crate::database::update::UpdateBuilder;
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
|
||||
#[test]
|
||||
fn ingest_update_file() -> Result<(), Box<Error>> {
|
||||
fn ingest_one_update_file() -> Result<(), Box<Error>> {
|
||||
let dir = tempdir()?;
|
||||
let stop_words = HashSet::new();
|
||||
|
||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct SimpleDoc {
|
||||
id: u64,
|
||||
title: String,
|
||||
description: String,
|
||||
timestamp: u64,
|
||||
}
|
||||
|
||||
let schema = {
|
||||
let mut builder = SchemaBuilder::new();
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("id", STORED);
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
builder.new_attribute("timestamp", STORED);
|
||||
builder.build()
|
||||
};
|
||||
|
||||
let database = Database::create(&rocksdb_path, schema.clone())?;
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let database = Database::create(&rocksdb_path, &schema)?;
|
||||
|
||||
let update_path = dir.path().join("update.sst");
|
||||
|
||||
let doc0 = SimpleDoc {
|
||||
id: 0,
|
||||
title: String::from("I am a title"),
|
||||
description: String::from("I am a description"),
|
||||
timestamp: 1234567,
|
||||
};
|
||||
let doc1 = SimpleDoc {
|
||||
id: 1,
|
||||
title: String::from("I am the second title"),
|
||||
description: String::from("I am the second description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let mut update = {
|
||||
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
||||
let docid0;
|
||||
let docid1;
|
||||
let update = {
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
||||
|
||||
builder.update(0, &doc0).unwrap();
|
||||
builder.update(1, &doc1).unwrap();
|
||||
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||
|
||||
builder.build()?
|
||||
};
|
||||
|
||||
update.set_move(true);
|
||||
database.ingest_update_file(update)?;
|
||||
let view = database.view();
|
||||
|
||||
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
|
||||
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
|
||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
||||
|
||||
assert_eq!(doc0, de_doc0);
|
||||
assert_eq!(doc1, de_doc1);
|
||||
|
||||
Ok(dir.close()?)
|
||||
}
|
||||
|
||||
/// Ingests two separate update files, each carrying two documents, and checks
/// that all four documents can be read back unchanged from the database view.
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
    #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
    struct SimpleDoc {
        id: u64,
        title: String,
        description: String,
        timestamp: u64,
    }

    let dir = tempdir()?;
    let stop_words = HashSet::new();

    let rocksdb_path = dir.path().join("rocksdb.rdb");

    let mut schema_builder = SchemaBuilder::with_identifier("id");
    schema_builder.new_attribute("id", STORED);
    schema_builder.new_attribute("title", STORED | INDEXED);
    schema_builder.new_attribute("description", STORED | INDEXED);
    schema_builder.new_attribute("timestamp", STORED);
    let schema = schema_builder.build();

    let database = Database::create(&rocksdb_path, &schema)?;

    let doc0 = SimpleDoc {
        id: 0,
        title: "I am a title".to_string(),
        description: "I am a description".to_string(),
        timestamp: 1234567,
    };
    let doc1 = SimpleDoc {
        id: 1,
        title: "I am the second title".to_string(),
        description: "I am the second description".to_string(),
        timestamp: 7654321,
    };
    let doc2 = SimpleDoc {
        id: 2,
        title: "I am the third title".to_string(),
        description: "I am the third description".to_string(),
        timestamp: 7654321,
    };
    let doc3 = SimpleDoc {
        id: 3,
        title: "I am the fourth title".to_string(),
        description: "I am the fourth description".to_string(),
        timestamp: 7654321,
    };

    // First update file: documents 0 and 1.
    let (first_update, docid0, docid1) = {
        let tokenizer_builder = DefaultBuilder::new();
        let sst_path = dir.path().join("update-000.sst");
        let mut update_builder = UpdateBuilder::new(sst_path, schema.clone());

        let first = update_builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
        let second = update_builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;

        (update_builder.build()?, first, second)
    };

    // Second update file: documents 2 and 3.
    let (second_update, docid2, docid3) = {
        let tokenizer_builder = DefaultBuilder::new();
        let sst_path = dir.path().join("update-001.sst");
        let mut update_builder = UpdateBuilder::new(sst_path, schema);

        let first = update_builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
        let second = update_builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;

        (update_builder.build()?, first, second)
    };

    database.ingest_update_file(first_update)?;
    database.ingest_update_file(second_update)?;

    let view = database.view();

    let stored0: SimpleDoc = view.document_by_id(docid0)?;
    let stored1: SimpleDoc = view.document_by_id(docid1)?;

    assert_eq!(doc0, stored0);
    assert_eq!(doc1, stored1);

    let stored2: SimpleDoc = view.document_by_id(docid2)?;
    let stored3: SimpleDoc = view.document_by_id(docid3)?;

    assert_eq!(doc2, stored2);
    assert_eq!(doc3, stored3);

    Ok(dir.close()?)
}
|
||||
}
|
||||
|
||||
/// Benchmarks for opening a database and for one-letter searches.
///
/// Only compiled for test builds with the `nightly` feature enabled, since it
/// relies on the `test` crate and `#[bench]`.
#[cfg(all(feature = "nightly", test))]
mod bench {
    extern crate test;

    use super::*;
    use std::error::Error;
    use std::iter::repeat_with;
    use std::path::{Path, PathBuf};
    use self::test::Bencher;

    use rand::distributions::Alphanumeric;
    use rand_xorshift::XorShiftRng;
    use rand::{Rng, SeedableRng};
    use serde_derive::Serialize;
    use rand::seq::SliceRandom;
    use hashbrown::HashSet;

    use crate::tokenizer::DefaultBuilder;
    use crate::database::update::UpdateBuilder;
    use crate::database::schema::*;

    /// Document shape indexed by every benchmark.
    #[derive(Serialize)]
    struct Document {
        id: u64,
        title: String,
        description: String,
    }

    /// Generates `number` random alphanumeric words.
    ///
    /// Each word has a length drawn from `rng.gen_range(1, 12)`. Every word
    /// but the last is followed by `","` or `", "`; the last one is followed
    /// by one of `"."`, `"?"`, `"!"` or `"..."`.
    fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
        let mut words = String::new();

        for i in 0..number {
            let word_len = rng.gen_range(1, 12);
            let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
            words.extend(iter);

            if i == number - 1 { // last word
                let final_ = [".", "?", "!", "..."].choose(rng).cloned();
                words.extend(final_);
            } else {
                let middle = [",", ", "].choose(rng).cloned();
                words.extend(middle);
            }
        }

        words
    }

    /// Builds the schema shared by all benchmarks: identifier `id` plus the
    /// stored and indexed `title` and `description` attributes.
    fn bench_schema() -> Schema {
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("title", STORED | INDEXED);
        builder.new_attribute("description", STORED | INDEXED);
        builder.build()
    }

    /// Creates a database at `<dir>/bench.mdb` and ingests a single update
    /// file (`<dir>/update-000.sst`) holding `count` random documents.
    ///
    /// The generator is seeded with 42 so every run indexes the same data.
    /// Returns the database path, the database itself and the view produced
    /// by the ingestion.
    fn populated_database(
        dir: &Path,
        count: u64,
    ) -> Result<(PathBuf, Database, Arc<DatabaseView<Arc<DB>>>), Box<Error>>
    {
        let stop_words = HashSet::new();
        let schema = bench_schema();

        let db_path = dir.join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;

        let path = dir.join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut rng = XorShiftRng::seed_from_u64(42);

        for id in 0..count {
            let document = Document {
                id,
                title: random_sentences(rng.gen_range(1, 8), &mut rng),
                description: random_sentences(rng.gen_range(20, 200), &mut rng),
            };
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        let view = database.ingest_update_file(update)?;

        Ok((db_path, database, view))
    }

    /// Measures `Database::open` on a database holding `count` documents.
    fn bench_open(bench: &mut Bencher, count: u64) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
        let (db_path, database, view) = populated_database(dir.path(), count)?;

        // Neither the view nor the database used for the setup may outlive
        // this point: the measured code reopens the same path.
        drop(view);
        drop(database);

        bench.iter(|| {
            let database = Database::open(db_path.clone()).unwrap();
            test::black_box(database);
        });

        Ok(())
    }

    /// Measures five one-letter queries (first 20 results each) against a
    /// database holding `count` documents.
    fn bench_search_oneletter(bench: &mut Bencher, count: u64) -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
        // The database is kept alive for the whole benchmark.
        let (_db_path, _database, view) = populated_database(dir.path(), count)?;

        bench.iter(|| {
            for q in &["a", "b", "c", "d", "e"] {
                let documents = view.query_builder().unwrap().query(q, 0..20);
                test::black_box(documents);
            }
        });

        Ok(())
    }

    #[bench]
    fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_open(bench, 300)
    }

    #[bench]
    fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_open(bench, 3000)
    }

    #[bench]
    #[ignore]
    fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_open(bench, 30_000)
    }

    #[bench]
    fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_search_oneletter(bench, 300)
    }

    #[bench]
    fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_search_oneletter(bench, 3000)
    }

    #[bench]
    #[ignore]
    fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
        bench_search_oneletter(bench, 30_000)
    }
}
|
||||
|
@ -1,29 +1,36 @@
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::io::{Read, Write};
|
||||
use std::{fmt, u32};
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
use std::{fmt, u16};
|
||||
use std::ops::BitOr;
|
||||
use std::sync::Arc;
|
||||
use std::fs::File;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
|
||||
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct SchemaProps {
|
||||
#[serde(default)]
|
||||
stored: bool,
|
||||
|
||||
#[serde(default)]
|
||||
indexed: bool,
|
||||
}
|
||||
|
||||
impl SchemaProps {
|
||||
pub fn is_stored(&self) -> bool {
|
||||
pub fn is_stored(self) -> bool {
|
||||
self.stored
|
||||
}
|
||||
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
pub fn is_indexed(self) -> bool {
|
||||
self.indexed
|
||||
}
|
||||
}
|
||||
@ -39,33 +46,39 @@ impl BitOr for SchemaProps {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SchemaBuilder {
|
||||
attrs: LinkedHashMap<String, SchemaProps>,
|
||||
identifier: String,
|
||||
attributes: LinkedHashMap<String, SchemaProps>,
|
||||
}
|
||||
|
||||
impl SchemaBuilder {
|
||||
pub fn new() -> SchemaBuilder {
|
||||
SchemaBuilder { attrs: LinkedHashMap::new() }
|
||||
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
|
||||
SchemaBuilder {
|
||||
identifier: name.into(),
|
||||
attributes: LinkedHashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
|
||||
let len = self.attrs.len();
|
||||
if self.attrs.insert(name.into(), props).is_some() {
|
||||
let len = self.attributes.len();
|
||||
if self.attributes.insert(name.into(), props).is_some() {
|
||||
panic!("Field already inserted.")
|
||||
}
|
||||
SchemaAttr(len as u32)
|
||||
SchemaAttr(len as u16)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Schema {
|
||||
let mut attrs = HashMap::new();
|
||||
let mut props = Vec::new();
|
||||
|
||||
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u32));
|
||||
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u16));
|
||||
props.push((name, prop));
|
||||
}
|
||||
|
||||
Schema { inner: Arc::new(InnerSchema { attrs, props }) }
|
||||
let identifier = self.identifier;
|
||||
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
|
||||
}
|
||||
}
|
||||
|
||||
@ -76,69 +89,124 @@ pub struct Schema {
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct InnerSchema {
|
||||
identifier: String,
|
||||
attrs: HashMap<String, SchemaAttr>,
|
||||
props: Vec<(String, SchemaProps)>,
|
||||
}
|
||||
|
||||
impl Schema {
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
|
||||
let file = File::open(path)?;
|
||||
Schema::read_from(file)
|
||||
}
|
||||
|
||||
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
|
||||
let attrs = bincode::deserialize_from(reader)?;
|
||||
let builder = SchemaBuilder { attrs };
|
||||
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
|
||||
let mut buffer = Vec::new();
|
||||
reader.read_to_end(&mut buffer)?;
|
||||
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
|
||||
let string = toml::to_string_pretty(&builder)?;
|
||||
writer.write_all(string.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
|
||||
let mut buffer = Vec::new();
|
||||
reader.read_to_end(&mut buffer)?;
|
||||
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
let string = serde_json::to_string_pretty(&builder)?;
|
||||
writer.write_all(string.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
|
||||
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
let identifier = self.inner.identifier.clone();
|
||||
let attributes = self.attributes_ordered();
|
||||
let builder = SchemaBuilder { identifier, attributes };
|
||||
|
||||
bincode::serialize_into(writer, &builder)
|
||||
}
|
||||
|
||||
fn attributes_ordered(&self) -> LinkedHashMap<String, SchemaProps> {
|
||||
let mut ordered = BTreeMap::new();
|
||||
for (name, field) in &self.inner.attrs {
|
||||
let index = field.as_u32();
|
||||
let (_, props) = self.inner.props[index as usize];
|
||||
ordered.insert(index, (name, props));
|
||||
for (name, attr) in &self.inner.attrs {
|
||||
let (_, props) = self.inner.props[attr.0 as usize];
|
||||
ordered.insert(attr.0, (name, props));
|
||||
}
|
||||
|
||||
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
|
||||
let mut attributes = LinkedHashMap::with_capacity(ordered.len());
|
||||
for (_, (name, props)) in ordered {
|
||||
attrs.insert(name, props);
|
||||
attributes.insert(name.clone(), props);
|
||||
}
|
||||
|
||||
bincode::serialize_into(writer, &attrs)
|
||||
attributes
|
||||
}
|
||||
|
||||
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
let id_attribute_name = &self.inner.identifier;
|
||||
let serializer = FindDocumentIdSerializer { id_attribute_name };
|
||||
document.serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
|
||||
let index = attr.as_u32();
|
||||
let (_, props) = self.inner.props[index as usize];
|
||||
let (_, props) = self.inner.props[attr.0 as usize];
|
||||
props
|
||||
}
|
||||
|
||||
pub fn identifier_name(&self) -> &str {
|
||||
&self.inner.identifier
|
||||
}
|
||||
|
||||
/// Looks up the `SchemaAttr` registered under `name`, or `None` when the
/// schema has no attribute with that name.
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
    let key = name.as_ref();
    self.inner.attrs.get(key).map(|attr| *attr)
}
|
||||
|
||||
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
|
||||
let index = attr.as_u32();
|
||||
let (name, _) = &self.inner.props[index as usize];
|
||||
let (name, _) = &self.inner.props[attr.0 as usize];
|
||||
name
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
|
||||
pub struct SchemaAttr(u32);
|
||||
pub struct SchemaAttr(pub(crate) u16);
|
||||
|
||||
impl SchemaAttr {
|
||||
pub fn new(value: u32) -> SchemaAttr {
|
||||
pub fn new(value: u16) -> SchemaAttr {
|
||||
SchemaAttr(value)
|
||||
}
|
||||
|
||||
pub fn max() -> SchemaAttr {
|
||||
SchemaAttr(u32::MAX)
|
||||
pub fn min() -> SchemaAttr {
|
||||
SchemaAttr(0)
|
||||
}
|
||||
|
||||
pub fn as_u32(&self) -> u32 {
|
||||
self.0
|
||||
pub fn next(self) -> Option<SchemaAttr> {
|
||||
self.0.checked_add(1).map(SchemaAttr)
|
||||
}
|
||||
|
||||
pub fn prev(self) -> Option<SchemaAttr> {
|
||||
self.0.checked_sub(1).map(SchemaAttr)
|
||||
}
|
||||
|
||||
pub fn max() -> SchemaAttr {
|
||||
SchemaAttr(u16::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
@ -151,22 +219,92 @@ impl fmt::Display for SchemaAttr {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> bincode::Result<()> {
|
||||
let mut builder = SchemaBuilder::new();
|
||||
builder.new_attribute("alphabet", STORED);
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("alpha", STORED);
|
||||
builder.new_attribute("beta", STORED | INDEXED);
|
||||
builder.new_attribute("gamma", INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
schema.write_to(&mut buffer)?;
|
||||
let schema2 = Schema::read_from(buffer.as_slice())?;
|
||||
schema.write_to_bin(&mut buffer)?;
|
||||
let schema2 = Schema::read_from_bin(buffer.as_slice())?;
|
||||
|
||||
assert_eq!(schema, schema2);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("alpha", STORED);
|
||||
builder.new_attribute("beta", STORED | INDEXED);
|
||||
builder.new_attribute("gamma", INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
schema.to_toml(&mut buffer)?;
|
||||
|
||||
let schema2 = Schema::from_toml(buffer.as_slice())?;
|
||||
assert_eq!(schema, schema2);
|
||||
|
||||
let data = r#"
|
||||
identifier = "id"
|
||||
|
||||
[attributes."alpha"]
|
||||
stored = true
|
||||
|
||||
[attributes."beta"]
|
||||
stored = true
|
||||
indexed = true
|
||||
|
||||
[attributes."gamma"]
|
||||
indexed = true
|
||||
"#;
|
||||
let schema2 = Schema::from_toml(data.as_bytes())?;
|
||||
assert_eq!(schema, schema2);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Checks that a schema survives a JSON round trip (`to_json` then `from_json`)
// and that a hand-written JSON document parses to the same schema.
#[test]
|
||||
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
|
||||
// Reference schema: identifier "id" plus one attribute per property
// combination used below (stored, stored + indexed, indexed).
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("alpha", STORED);
|
||||
builder.new_attribute("beta", STORED | INDEXED);
|
||||
builder.new_attribute("gamma", INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
// Round trip through an in-memory buffer.
let mut buffer = Vec::new();
|
||||
schema.to_json(&mut buffer)?;
|
||||
|
||||
let schema2 = Schema::from_json(buffer.as_slice())?;
|
||||
assert_eq!(schema, schema2);
|
||||
|
||||
// Hand-written JSON expected to describe the same schema.
let data = r#"
|
||||
{
|
||||
"identifier": "id",
|
||||
"attributes": {
|
||||
"alpha": {
|
||||
"stored": true
|
||||
},
|
||||
"beta": {
|
||||
"stored": true,
|
||||
"indexed": true
|
||||
},
|
||||
"gamma": {
|
||||
"indexed": true
|
||||
}
|
||||
}
|
||||
}"#;
|
||||
let schema2 = Schema::from_json(data.as_bytes())?;
|
||||
assert_eq!(schema, schema2);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
243
src/database/serde/find_id.rs
Normal file
243
src/database/serde/find_id.rs
Normal file
@ -0,0 +1,243 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::serde::{SerializerError, calculate_hash};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct FindDocumentIdSerializer<'a> {
|
||||
pub id_attribute_name: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = FindDocumentIdMapSerializer<'a>;
|
||||
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(FindDocumentIdMapSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(FindDocumentIdStructSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdMapSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if self.id_attribute_name == key {
|
||||
// TODO is it possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdStructSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if self.id_attribute_name == key {
|
||||
// TODO can it be possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
196
src/database/serde/indexer_serializer.rs
Normal file
196
src/database/serde/indexer_serializer.rs
Normal file
@ -0,0 +1,196 @@
|
||||
use crate::database::update::DocumentUpdate;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::tokenizer::Token;
|
||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
||||
|
||||
use hashbrown::HashSet;
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
pub struct IndexerSerializer<'a, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub update: &'a mut DocumentUpdate,
|
||||
pub document_id: DocumentId,
|
||||
pub attribute: SchemaAttr,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for token in self.tokenizer_builder.build(v) {
|
||||
let Token { word, word_index, char_index } = token;
|
||||
let document_id = self.document_id;
|
||||
|
||||
// FIXME must u32::try_from instead
|
||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
||||
Ok(attribute) => attribute,
|
||||
Err(_) => return Ok(()),
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
let length = word.chars().count() as u16;
|
||||
|
||||
if self.stop_words.contains(&word_lower) { continue }
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
let word_area = match WordArea::new(char_index as u32, length) {
|
||||
Ok(word_area) => word_area,
|
||||
Err(_) => return Ok(()),
|
||||
};
|
||||
|
||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
||||
}
|
||||
|
||||
let word_area = match WordArea::new(char_index as u32, length) {
|
||||
Ok(word_area) => word_area,
|
||||
Err(_) => return Ok(()),
|
||||
};
|
||||
|
||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
||||
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
146
src/database/serde/key_to_string.rs
Normal file
146
src/database/serde/key_to_string.rs
Normal file
@ -0,0 +1,146 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::SerializerError;
|
||||
|
||||
pub struct KeyToStringSerializer;
|
||||
|
||||
impl ser::Serializer for KeyToStringSerializer {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
57
src/database/serde/mod.rs
Normal file
57
src/database/serde/mod.rs
Normal file
@ -0,0 +1,57 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use serde::ser;
|
||||
|
||||
/// Generates `Serializer` methods that reject the listed primitive types.
///
/// Each `ty => method,` pair expands to a method named `method` that takes a
/// value of type `ty` and returns `SerializerError::UnserializableType` whose
/// `name` is the spelled-out type (for example `"bool"`).
///
/// Must be invoked inside an impl that defines `Self::Ok` and `Self::Error`,
/// with `SerializerError` in scope.
macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // `stringify!` is required here: metavariables are not expanded
                // inside string literals, so "$ty" would be reported verbatim.
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
            }
        )*
    }
}
|
||||
|
||||
pub mod find_id;
|
||||
pub mod key_to_string;
|
||||
pub mod serializer;
|
||||
pub mod indexer_serializer;
|
||||
|
||||
/// Hashes `t` with a freshly created `DefaultHasher` and returns the 64-bit
/// digest.
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::default();
    Hash::hash(t, &mut hasher);
    Hasher::finish(&hasher)
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
DocumentIdNotFound,
|
||||
UnserializableType { name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
write!(f, "serialized document does not have an id according to the schema")
|
||||
}
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
286
src/database/serde/serializer.rs
Normal file
286
src/database/serde/serializer.rs
Normal file
@ -0,0 +1,286 @@
|
||||
use hashbrown::HashSet;
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::update::DocumentUpdate;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Serializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub update: &'a mut DocumentUpdate,
|
||||
pub document_id: DocumentId,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, B>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
schema: self.schema,
|
||||
document_id: self.document_id,
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
update: self.update,
|
||||
document_id: self.document_id,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
stop_words: self.stop_words,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
pub current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
self.update.insert_attribute_value(attr, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub update: &'a mut DocumentUpdate,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub stop_words: &'a HashSet<String>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
self.update.insert_attribute_value(attr, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
update: self.update,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
stop_words: self.stop_words,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
64
src/database/update/builder.rs
Normal file
64
src/database/update/builder.rs
Normal file
@ -0,0 +1,64 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use hashbrown::HashSet;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::database::serde::serializer::Serializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::Schema;
|
||||
|
||||
use crate::DocumentId;
|
||||
use super::{Update, RawUpdateBuilder};
|
||||
|
||||
pub struct UpdateBuilder {
|
||||
schema: Schema,
|
||||
raw_builder: RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl UpdateBuilder {
|
||||
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
|
||||
UpdateBuilder {
|
||||
schema: schema,
|
||||
raw_builder: RawUpdateBuilder::new(path),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_document<T, B>(
|
||||
&mut self,
|
||||
document: T,
|
||||
tokenizer_builder: &B,
|
||||
stop_words: &HashSet<String>,
|
||||
) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
B: TokenizerBuilder,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
let update = self.raw_builder.document_update(document_id);
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: document_id,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
update: update,
|
||||
stop_words: stop_words,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
self.raw_builder.document_update(document_id).remove();
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
self.raw_builder.build()
|
||||
}
|
||||
}
|
@ -1,35 +1,17 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
mod negative;
|
||||
mod positive;
|
||||
mod builder;
|
||||
mod raw_builder;
|
||||
|
||||
pub use self::positive::{PositiveUpdateBuilder, NewState};
|
||||
pub use self::negative::NegativeUpdateBuilder;
|
||||
pub use self::builder::UpdateBuilder;
|
||||
pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
|
||||
|
||||
pub struct Update {
|
||||
path: PathBuf,
|
||||
can_be_moved: bool,
|
||||
sst_file: PathBuf,
|
||||
}
|
||||
|
||||
impl Update {
|
||||
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: false })
|
||||
}
|
||||
|
||||
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: true })
|
||||
}
|
||||
|
||||
pub fn set_move(&mut self, can_be_moved: bool) {
|
||||
self.can_be_moved = can_be_moved
|
||||
}
|
||||
|
||||
pub fn can_be_moved(&self) -> bool {
|
||||
self.can_be_moved
|
||||
}
|
||||
|
||||
pub fn into_path_buf(self) -> PathBuf {
|
||||
self.path
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.sst_file
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::NegativeUpdateBuilder;
|
@ -1,37 +0,0 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct UnorderedNegativeBlobBuilder<W> {
|
||||
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
|
||||
wrt: W,
|
||||
}
|
||||
|
||||
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
UnorderedNegativeBlobBuilder::new(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
|
||||
pub fn new(wrt: W) -> Self {
|
||||
Self {
|
||||
doc_ids: BTreeSet::new(),
|
||||
wrt: wrt,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, doc: DocumentId) -> bool {
|
||||
self.doc_ids.insert(doc)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
for id in self.doc_ids {
|
||||
self.wrt.write_u64::<NativeEndian>(id)?;
|
||||
}
|
||||
Ok(self.wrt)
|
||||
}
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
|
||||
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
|
||||
use crate::database::blob::{Blob, NegativeBlob};
|
||||
use crate::database::update::Update;
|
||||
use crate::database::DocumentKey;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct NegativeUpdateBuilder {
|
||||
path: PathBuf,
|
||||
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl NegativeUpdateBuilder {
|
||||
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
|
||||
NegativeUpdateBuilder {
|
||||
path: path.into(),
|
||||
doc_ids: UnorderedNegativeBlobBuilder::memory(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, id: DocumentId) -> bool {
|
||||
self.doc_ids.insert(id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let bytes = self.doc_ids.into_inner()?;
|
||||
let negative_blob = NegativeBlob::from_bytes(bytes)?;
|
||||
let blob = Blob::Negative(negative_blob);
|
||||
|
||||
// write the data-index aka negative blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// FIXME remove this ugly thing !
|
||||
// let Blob::Negative(negative_blob) = blob;
|
||||
let negative_blob = match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => unreachable!(),
|
||||
};
|
||||
|
||||
for &document_id in negative_blob.as_ref().as_slice() {
|
||||
let start = DocumentKey::new(document_id);
|
||||
let end = start.with_attribute_max();
|
||||
file_writer.delete_range(start.as_ref(), end.as_ref())?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::{PositiveUpdateBuilder, NewState};
|
@ -1,49 +0,0 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::error::Error;
|
||||
use std::io::Write;
|
||||
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::positive::PositiveBlobBuilder;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct UnorderedPositiveBlobBuilder<W, X> {
|
||||
builder: PositiveBlobBuilder<W, X>,
|
||||
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
|
||||
}
|
||||
|
||||
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
Self {
|
||||
builder: PositiveBlobBuilder::memory(),
|
||||
map: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
|
||||
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
|
||||
Ok(UnorderedPositiveBlobBuilder {
|
||||
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
|
||||
map: BTreeMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
|
||||
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
|
||||
for (key, mut doc_indexes) in self.map {
|
||||
doc_indexes.sort_unstable();
|
||||
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
|
||||
}
|
||||
self.builder.into_inner()
|
||||
}
|
||||
}
|
@ -1,514 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
use serde::ser::{self, Serialize};
|
||||
|
||||
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::DocumentKeyAttr;
|
||||
use crate::database::update::Update;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::database::blob::Blob;
|
||||
|
||||
/// The new state of a document attribute inside an update.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NewState {
    /// The attribute is written with the given serialized bytes.
    Updated { value: Vec<u8> },
    /// The attribute is deleted.
    Removed,
}
|
||||
|
||||
pub struct PositiveUpdateBuilder<B> {
|
||||
path: PathBuf,
|
||||
schema: Schema,
|
||||
tokenizer_builder: B,
|
||||
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
|
||||
PositiveUpdateBuilder {
|
||||
path: path.into(),
|
||||
schema: schema,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: UnorderedPositiveBlobBuilder::memory(),
|
||||
new_states: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: id,
|
||||
tokenizer_builder: &self.tokenizer_builder,
|
||||
builder: &mut self.builder,
|
||||
new_states: &mut self.new_states
|
||||
};
|
||||
|
||||
Ok(ser::Serialize::serialize(document, serializer)?)
|
||||
}
|
||||
|
||||
// TODO value must be a field that can be indexed
|
||||
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
|
||||
let value = bincode::serialize(&value).unwrap();
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
|
||||
}
|
||||
|
||||
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
SchemaDontMatch { attribute: String },
|
||||
UnserializableType { name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::SchemaDontMatch { attribute } => {
|
||||
write!(f, "serialized document try to specify the \
|
||||
{:?} attribute that is not known by the schema", attribute)
|
||||
},
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
||||
|
||||
struct Serializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
/// Generates `serialize_*` methods that reject the given primitive types.
///
/// Each `type => method` pair expands to a method returning
/// `SerializerError::UnserializableType` carrying the name of the type.
macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // Metavariables are not expanded inside string literals, so
                // `"$ty"` would report the literal text `$ty`.
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
            }
        )*
    }
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
// Ok(MapSerializer {
|
||||
// schema: self.schema,
|
||||
// document_id: self.document_id,
|
||||
// new_states: self.new_states,
|
||||
// })
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
struct StructSerializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
match self.schema.attribute(key) {
|
||||
Some(attr) => {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.new_states.insert(key, NewState::Updated { value });
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct IndexerSerializer<'a, B> {
|
||||
tokenizer_builder: &'a B,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for (index, word) in self.tokenizer_builder.build(v) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute: self.attribute.as_u32() as u8,
|
||||
attribute_index: index as u32,
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert(word_lower, doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
|
||||
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
||||
let blob = Blob::Positive(positive_blob);
|
||||
|
||||
// write the data-index aka positive blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents fields updates
|
||||
for (key, state) in self.new_states {
|
||||
match state {
|
||||
NewState::Updated { value } => {
|
||||
file_writer.put(key.as_ref(), &value)?
|
||||
},
|
||||
NewState::Removed => file_writer.delete(key.as_ref())?,
|
||||
}
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
168
src/database/update/raw_builder.rs
Normal file
168
src/database/update/raw_builder.rs
Normal file
@ -0,0 +1,168 @@
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use rocksdb::rocksdb_options;
|
||||
use hashbrown::HashMap;
|
||||
use fst::map::Map;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
|
||||
use crate::database::{DATA_INDEX, DocumentKeyAttr};
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use super::Update;
|
||||
|
||||
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
|
||||
type Value = Vec<u8>;
|
||||
|
||||
pub struct RawUpdateBuilder {
|
||||
sst_file: PathBuf,
|
||||
document_updates: BTreeMap<DocumentId, DocumentUpdate>,
|
||||
}
|
||||
|
||||
pub struct DocumentUpdate {
|
||||
cleared: bool,
|
||||
words_indexes: HashMap<Token, Vec<DocIndex>>,
|
||||
attributes: BTreeMap<SchemaAttr, Value>,
|
||||
}
|
||||
|
||||
impl DocumentUpdate {
|
||||
pub fn new() -> DocumentUpdate {
|
||||
DocumentUpdate {
|
||||
cleared: false,
|
||||
words_indexes: HashMap::new(),
|
||||
attributes: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self) {
|
||||
self.cleared = true;
|
||||
self.clear();
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.words_indexes.clear();
|
||||
self.attributes.clear();
|
||||
}
|
||||
|
||||
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
|
||||
self.attributes.insert(attr, value);
|
||||
}
|
||||
|
||||
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
|
||||
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
|
||||
}
|
||||
}
|
||||
|
||||
impl RawUpdateBuilder {
|
||||
pub fn new(path: PathBuf) -> RawUpdateBuilder {
|
||||
RawUpdateBuilder {
|
||||
sst_file: path,
|
||||
document_updates: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
|
||||
match self.document_updates.entry(document_id) {
|
||||
Entry::Occupied(mut occupied) => {
|
||||
occupied.get_mut().clear();
|
||||
occupied.into_mut()
|
||||
},
|
||||
Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(mut self) -> Result<Update, Box<Error>> {
|
||||
let mut removed_document_ids = Vec::new();
|
||||
let mut words_indexes = BTreeMap::new();
|
||||
|
||||
for (&id, update) in self.document_updates.iter_mut() {
|
||||
if update.cleared { removed_document_ids.push(id) }
|
||||
|
||||
for (token, indexes) in &update.words_indexes {
|
||||
words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
|
||||
}
|
||||
}
|
||||
|
||||
let negative = {
|
||||
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
|
||||
let doc_ids = DocIds::new(removed_document_ids);
|
||||
Negative::new(doc_ids)
|
||||
};
|
||||
|
||||
let positive = {
|
||||
let mut positive_builder = PositiveBuilder::memory();
|
||||
|
||||
for (key, mut indexes) in words_indexes {
|
||||
indexes.sort_unstable();
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
positive_builder.insert(key, indexes)?;
|
||||
}
|
||||
|
||||
let (map, indexes) = positive_builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
let index = Index { negative, positive };
|
||||
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.sst_file.to_string_lossy())?;
|
||||
|
||||
// write the data-index
|
||||
let mut bytes = Vec::new();
|
||||
index.write_to_bytes(&mut bytes);
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents attributes updates
|
||||
for (id, update) in self.document_updates {
|
||||
|
||||
let mut last_attr: Option<SchemaAttr> = None;
|
||||
for (attr, value) in update.attributes {
|
||||
|
||||
if update.cleared {
|
||||
// if there is no last attribute, remove from the first attribute
|
||||
let start_attr = match last_attr {
|
||||
Some(attr) => attr.next(),
|
||||
None => Some(SchemaAttr::min())
|
||||
};
|
||||
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
|
||||
let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));
|
||||
|
||||
// delete_range between (last_attr + 1) and (attr - 1)
|
||||
if let (Some(start), Some(end)) = (start, end) {
|
||||
file_writer.delete_range(start.as_ref(), end.as_ref())?;
|
||||
}
|
||||
}
|
||||
|
||||
let key = DocumentKeyAttr::new(id, attr);
|
||||
file_writer.put(key.as_ref(), &value)?;
|
||||
last_attr = Some(attr);
|
||||
}
|
||||
|
||||
if update.cleared {
|
||||
// if there is no last attribute, remove from the first attribute
|
||||
let start_attr = match last_attr {
|
||||
Some(attr) => attr.next(),
|
||||
None => Some(SchemaAttr::min())
|
||||
};
|
||||
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
|
||||
let end = DocumentKeyAttr::with_attribute_max(id);
|
||||
|
||||
// delete_range between (last_attr + 1) and attr_max
|
||||
if let Some(start) = start {
|
||||
file_writer.delete_range(start.as_ref(), end.as_ref())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
|
||||
Ok(Update { sst_file: self.sst_file })
|
||||
}
|
||||
}
|
@ -9,17 +9,17 @@ use serde::de::DeserializeOwned;
|
||||
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::{retrieve_data_schema, retrieve_data_index};
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::deserializer::Deserializer;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::rank::QueryBuilder;
|
||||
use crate::database::index::Index;
|
||||
use crate::rank::{QueryBuilder, FilterFunc};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct DatabaseView<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
snapshot: Snapshot<D>,
|
||||
blob: PositiveBlob,
|
||||
index: Index,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
|
||||
{
|
||||
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
|
||||
let schema = retrieve_data_schema(&snapshot)?;
|
||||
let blob = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, blob, schema })
|
||||
let index = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, index, schema })
|
||||
}
|
||||
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
pub fn blob(&self) -> &PositiveBlob {
|
||||
&self.blob
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
pub fn into_snapshot(self) -> Snapshot<D> {
|
||||
@ -71,19 +71,18 @@ where D: Deref<Target=DB>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> {
|
||||
pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
|
||||
QueryBuilder::new(self)
|
||||
}
|
||||
|
||||
// TODO create an enum error type
|
||||
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
|
||||
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
|
||||
where T: DeserializeOwned
|
||||
{
|
||||
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
|
||||
Ok(T::deserialize(&mut deserializer)?)
|
||||
}
|
||||
|
||||
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
|
||||
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
|
||||
where T: DeserializeOwned,
|
||||
I: IntoIterator<Item=DocumentId>,
|
||||
{
|
||||
@ -100,7 +99,7 @@ where D: Deref<Target=DB>
|
||||
{
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let mut options = ReadOptions::new();
|
||||
let lower = DocumentKey::new(0);
|
||||
let lower = DocumentKey::new(DocumentId(0));
|
||||
options.set_iterate_lower_bound(lower.as_ref());
|
||||
|
||||
let mut iter = self.snapshot.iter_opt(options);
|
||||
@ -149,7 +148,7 @@ where D: Deref<Target=DB>,
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.document_ids.next() {
|
||||
Some(id) => Some(self.database_view.retrieve_document(id)),
|
||||
Some(id) => Some(self.database_view.document_by_id(id)),
|
||||
None => None
|
||||
}
|
||||
}
|
||||
@ -168,7 +167,7 @@ where D: Deref<Target=DB>,
|
||||
{
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
match self.document_ids.next_back() {
|
||||
Some(id) => Some(self.database_view.retrieve_document(id)),
|
||||
Some(id) => Some(self.database_view.document_by_id(id)),
|
||||
None => None
|
||||
}
|
||||
}
|
78
src/lib.rs
78
src/lib.rs
@ -1,40 +1,49 @@
|
||||
#![cfg_attr(feature = "nightly", feature(test))]
|
||||
|
||||
pub mod automaton;
|
||||
pub mod database;
|
||||
pub mod data;
|
||||
pub mod rank;
|
||||
pub mod tokenizer;
|
||||
pub mod vec_read_only;
|
||||
mod attribute;
|
||||
mod word_area;
|
||||
mod common_words;
|
||||
|
||||
pub use rocksdb;
|
||||
|
||||
pub use self::tokenizer::Tokenizer;
|
||||
pub use self::common_words::CommonWords;
|
||||
pub use self::attribute::{Attribute, AttributeError};
|
||||
pub use self::word_area::{WordArea, WordAreaError};
|
||||
|
||||
pub type DocumentId = u64;
|
||||
/// Represents an internally generated unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
|
||||
|
||||
/// This structure represents the position of a word
|
||||
/// in a document and its attributes.
|
||||
///
|
||||
/// This is stored in the map, generated at index time,
|
||||
/// extracted and interpreted at search time.
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
#[repr(C)]
|
||||
pub struct DocIndex {
|
||||
/// The document identifier where the word was found.
|
||||
pub document_id: DocumentId,
|
||||
|
||||
/// The attribute identifier in the document
|
||||
/// where the word was found.
|
||||
///
|
||||
/// This is an `u8` therefore a document
|
||||
/// can not have more than `2^8` attributes.
|
||||
pub attribute: u8,
|
||||
/// The attribute in the document where the word was found
|
||||
/// along with the index in it.
|
||||
pub attribute: Attribute,
|
||||
|
||||
/// The index where the word was found in the attribute.
|
||||
/// The position in bytes where the word was found
|
||||
/// along with the length of it.
|
||||
///
|
||||
/// Only the first 1000 words are indexed.
|
||||
pub attribute_index: u32,
|
||||
/// It informs on the original word area in the text indexed
|
||||
/// without needing to run the tokenizer again.
|
||||
pub word_area: WordArea,
|
||||
}
|
||||
|
||||
/// This structure represents a matching word with information
|
||||
@ -45,7 +54,7 @@ pub struct DocIndex {
|
||||
///
|
||||
/// The word in itself is not important.
|
||||
// TODO do data oriented programming ? very arrays ?
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Match {
|
||||
/// The word index in the query sentence.
|
||||
/// Same as the `attribute_index` but for the query words.
|
||||
@ -57,23 +66,19 @@ pub struct Match {
|
||||
/// (i.e. the Levenshtein distance).
|
||||
pub distance: u8,
|
||||
|
||||
/// The attribute in which the word is located
|
||||
/// (i.e. Title is 0, Description is 1).
|
||||
///
|
||||
/// This is an `u8` therefore a document
|
||||
/// can not have more than `2^8` attributes.
|
||||
pub attribute: u8,
|
||||
|
||||
/// Where does this word is located in the attribute string
|
||||
/// (i.e. at the start or the end of the attribute).
|
||||
///
|
||||
/// The index in the attribute is limited to a maximum of `2^32`
|
||||
/// this is because we index only the first 1000 words
|
||||
/// in an attribute.
|
||||
pub attribute_index: u32,
|
||||
/// The attribute in the document where the word was found
|
||||
/// along with the index in it.
|
||||
pub attribute: Attribute,
|
||||
|
||||
/// Whether the word that match is an exact match or a prefix.
|
||||
pub is_exact: bool,
|
||||
|
||||
/// The position in bytes where the word was found
|
||||
/// along with the length of it.
|
||||
///
|
||||
/// It informs on the original word area in the text indexed
|
||||
/// without needing to run the tokenizer again.
|
||||
pub word_area: WordArea,
|
||||
}
|
||||
|
||||
impl Match {
|
||||
@ -81,9 +86,9 @@ impl Match {
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: 0,
|
||||
attribute_index: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 0),
|
||||
}
|
||||
}
|
||||
|
||||
@ -91,9 +96,20 @@ impl Match {
|
||||
Match {
|
||||
query_index: u32::max_value(),
|
||||
distance: u8::max_value(),
|
||||
attribute: u8::max_value(),
|
||||
attribute_index: u32::max_value(),
|
||||
attribute: Attribute::max_value(),
|
||||
is_exact: true,
|
||||
word_area: WordArea::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// Pins the in-memory size of `DocIndex` to 16 bytes, so that a field
    /// change which grows the structure is caught by the test suite.
    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
}
|
||||
|
@ -10,13 +10,13 @@ use crate::database::DatabaseView;
|
||||
use crate::Match;
|
||||
|
||||
#[inline]
|
||||
fn contains_exact(matches: &[Match]) -> bool {
|
||||
fn contains_exact(matches: &&[Match]) -> bool {
|
||||
matches.iter().any(|m| m.is_exact)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn number_exact_matches(matches: &[Match]) -> usize {
|
||||
GroupBy::new(matches, match_query_index).map(contains_exact).count()
|
||||
GroupBy::new(matches, match_query_index).filter(contains_exact).count()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
|
@ -29,7 +29,6 @@ pub use self::{
|
||||
pub trait Criterion<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
#[inline]
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
|
||||
|
||||
#[inline]
|
||||
@ -62,6 +61,7 @@ where D: Deref<Target=DB>
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct CriteriaBuilder<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
|
@ -46,13 +46,18 @@ use crate::rank::Document;
|
||||
/// let criterion = builder.build();
|
||||
///
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct SortBy<T> {
|
||||
_phantom: marker::PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T> SortBy<T> {
|
||||
pub fn new() -> Self {
|
||||
SortBy::default()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Default for SortBy<T> {
|
||||
fn default() -> SortBy<T> {
|
||||
SortBy { _phantom: marker::PhantomData }
|
||||
}
|
||||
}
|
||||
@ -62,12 +67,12 @@ where D: Deref<Target=DB>,
|
||||
T: DeserializeOwned + Ord,
|
||||
{
|
||||
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
|
||||
let lhs = match view.retrieve_document::<T>(lhs.id) {
|
||||
let lhs = match view.document_by_id::<T>(lhs.id) {
|
||||
Ok(doc) => Some(doc),
|
||||
Err(e) => { eprintln!("{}", e); None },
|
||||
};
|
||||
|
||||
let rhs = match view.retrieve_document::<T>(rhs.id) {
|
||||
let rhs = match view.document_by_id::<T>(rhs.id) {
|
||||
Ok(doc) => Some(doc),
|
||||
Err(e) => { eprintln!("{}", e); None },
|
||||
};
|
||||
|
@ -11,14 +11,14 @@ use crate::database::DatabaseView;
|
||||
use crate::Match;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(matches: &[Match]) -> i8 {
|
||||
fn sum_matches_typos(matches: &[Match]) -> isize {
|
||||
let mut sum_typos = 0;
|
||||
let mut number_words = 0;
|
||||
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can make this assumption safely
|
||||
for group in GroupBy::new(matches, match_query_index) {
|
||||
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
|
||||
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
|
||||
number_words += 1;
|
||||
}
|
||||
|
||||
@ -44,6 +44,8 @@ where D: Deref<Target=DB>
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::{DocumentId, Attribute, WordArea};
|
||||
|
||||
// typing: "Geox CEO"
|
||||
//
|
||||
// doc0: "Geox SpA: CEO and Executive"
|
||||
@ -52,22 +54,46 @@ mod tests {
|
||||
fn one_typo_reference() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
Match {
|
||||
query_index: 1,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 2),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
id: DocumentId(0),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 1,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
Match {
|
||||
query_index: 1,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 2),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
id: DocumentId(1),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
@ -85,21 +111,39 @@ mod tests {
|
||||
fn no_typo() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
Match {
|
||||
query_index: 1,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 1),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
id: DocumentId(0),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
id: DocumentId(1),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
@ -117,21 +161,39 @@ mod tests {
|
||||
fn one_typo() {
|
||||
let doc0 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
Match {
|
||||
query_index: 1,
|
||||
distance: 1,
|
||||
attribute: Attribute::new_faillible(0, 1),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 0,
|
||||
id: DocumentId(0),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let matches = vec![
|
||||
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
|
||||
Match {
|
||||
query_index: 0,
|
||||
distance: 0,
|
||||
attribute: Attribute::new_faillible(0, 0),
|
||||
is_exact: false,
|
||||
word_area: WordArea::new_faillible(0, 6)
|
||||
},
|
||||
];
|
||||
Document {
|
||||
id: 1,
|
||||
id: DocumentId(1),
|
||||
matches: matches,
|
||||
}
|
||||
};
|
||||
|
@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
|
||||
use crate::Match;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
||||
fn sum_matches_attributes(matches: &[Match]) -> usize {
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can make this assumption safely
|
||||
GroupBy::new(matches, match_query_index).map(|group| unsafe {
|
||||
group.get_unchecked(0).attribute
|
||||
GroupBy::new(matches, match_query_index).map(|group| {
|
||||
unsafe { group.get_unchecked(0).attribute.attribute() as usize }
|
||||
}).sum()
|
||||
}
|
||||
|
||||
|
@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
|
||||
use crate::Match;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
||||
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can make this assumption safely
|
||||
GroupBy::new(matches, match_query_index).map(|group| unsafe {
|
||||
group.get_unchecked(0).attribute_index
|
||||
GroupBy::new(matches, match_query_index).map(|group| {
|
||||
unsafe { group.get_unchecked(0).attribute.word_index() as usize }
|
||||
}).sum()
|
||||
}
|
||||
|
||||
|
@ -20,8 +20,8 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
}
|
||||
|
||||
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
|
||||
if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
|
||||
index_proximity(lhs.attribute_index, rhs.attribute_index)
|
||||
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
|
||||
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
|
||||
}
|
||||
|
||||
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
|
||||
@ -67,6 +67,8 @@ where D: Deref<Target=DB>
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::Attribute;
|
||||
|
||||
#[test]
|
||||
fn three_different_attributes() {
|
||||
|
||||
@ -79,11 +81,11 @@ mod tests {
|
||||
// { id: 3, attr: 3, attr_index: 1 }
|
||||
|
||||
let matches = &[
|
||||
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
|
||||
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
|
||||
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
|
||||
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
|
||||
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
|
||||
];
|
||||
|
||||
// soup -> of = 8
|
||||
@ -105,12 +107,12 @@ mod tests {
|
||||
// { id: 3, attr: 1, attr_index: 3 }
|
||||
|
||||
let matches = &[
|
||||
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() },
|
||||
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
|
||||
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
|
||||
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
|
||||
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
|
||||
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
|
||||
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
|
||||
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
|
||||
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
|
||||
];
|
||||
|
||||
// soup -> of = 1
|
||||
@ -119,3 +121,42 @@ mod tests {
|
||||
assert_eq!(matches_proximity(matches), 3);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "nightly", test))]
mod bench {
    extern crate test;

    use super::*;
    use std::error::Error;
    use self::test::Bencher;

    use rand_xorshift::XorShiftRng;
    use rand::{Rng, SeedableRng};

    use crate::Attribute;

    /// Benchmarks `matches_proximity` on 30 000 pseudo-random matches.
    ///
    /// The generator is seeded with a constant, so every run measures
    /// the same input.
    #[bench]
    fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let number_matches = 30_000;
        let mut matches = Vec::with_capacity(number_matches);
        let mut rng = XorShiftRng::seed_from_u64(42);

        for _ in 0..number_matches {
            let query_index = rng.gen_range(0, 4);

            let attribute = rng.gen_range(0, 5);
            let word_index = rng.gen_range(0, 15);
            let attribute = Attribute::new_faillible(attribute, word_index);

            let match_ = Match { query_index, attribute, ..Match::zero() };
            matches.push(match_);
        }

        bench.iter(|| {
            // hand the computed value itself to `black_box` so that the
            // optimizer cannot discard the computation being measured
            let proximity = matches_proximity(&matches);
            test::black_box(proximity)
        });

        Ok(())
    }
}
|
||||
|
@ -4,7 +4,7 @@ mod distinct_map;
|
||||
|
||||
use crate::{Match, DocumentId};
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
|
||||
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
||||
|
||||
#[inline]
|
||||
fn match_query_index(a: &Match, b: &Match) -> bool {
|
||||
|
@ -4,10 +4,11 @@ use std::error::Error;
|
||||
use std::hash::Hash;
|
||||
use std::rc::Rc;
|
||||
|
||||
use group_by::GroupByMut;
|
||||
use group_by::BinaryGroupByMut;
|
||||
use hashbrown::HashMap;
|
||||
use fst::Streamer;
|
||||
use rocksdb::DB;
|
||||
use log::info;
|
||||
|
||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||
@ -34,14 +35,17 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
|
||||
automatons
|
||||
}
|
||||
|
||||
pub struct QueryBuilder<'a, D>
|
||||
/// Function-pointer form of a document filter: it receives a document id and
/// the database view and returns whether the document is accepted.
///
/// This is the filter type of the builder returned by `QueryBuilder::new`.
pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
|
||||
|
||||
pub struct QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
view: &'a DatabaseView<D>,
|
||||
criteria: Criteria<D>,
|
||||
filter: Option<FI>,
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
|
||||
@ -49,19 +53,27 @@ where D: Deref<Target=DB>
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
where D: Deref<Target=DB>
|
||||
impl<'a, D, FI> QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>,
|
||||
{
|
||||
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
|
||||
Ok(QueryBuilder { view, criteria })
|
||||
Ok(QueryBuilder { view, criteria, filter: None })
|
||||
}
|
||||
|
||||
pub fn criteria(&mut self, criteria: Criteria<D>) -> &mut Self {
|
||||
self.criteria = criteria;
|
||||
self
|
||||
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
{
|
||||
QueryBuilder {
|
||||
view: self.view,
|
||||
criteria: self.criteria,
|
||||
filter: Some(function)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_distinct<F>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, F> {
|
||||
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self,
|
||||
function: function,
|
||||
@ -75,12 +87,13 @@ where D: Deref<Target=DB>
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::map::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = self.view.blob().as_map().search(automaton);
|
||||
let stream = self.view.index().positive.map().search(automaton);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.union()
|
||||
};
|
||||
|
||||
let mut number_matches = 0;
|
||||
let mut matches = HashMap::new();
|
||||
|
||||
while let Some((input, indexed_values)) = stream.next() {
|
||||
@ -89,39 +102,55 @@ where D: Deref<Target=DB>
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
|
||||
let doc_indexes = self.view.blob().as_indexes();
|
||||
let doc_indexes = &self.view.index().positive.indexes();
|
||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||
|
||||
number_matches += doc_indexes.len();
|
||||
for doc_index in doc_indexes {
|
||||
let match_ = Match {
|
||||
query_index: iv.index as u32,
|
||||
distance: distance,
|
||||
attribute: doc_index.attribute,
|
||||
attribute_index: doc_index.attribute_index,
|
||||
is_exact: is_exact,
|
||||
word_area: doc_index.word_area,
|
||||
};
|
||||
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect()
|
||||
info!("{} total documents to classify", matches.len());
|
||||
info!("{} total matches to classify", number_matches);
|
||||
|
||||
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D> QueryBuilder<'a, D>
|
||||
impl<'a, D, FI> QueryBuilder<'a, D, FI>
|
||||
where D: Deref<Target=DB>,
|
||||
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
{
|
||||
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
let mut documents = self.query_all(query);
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
// We give the filtering work to the query distinct builder,
|
||||
// specifying a distinct rule that has no effect.
|
||||
if self.filter.is_some() {
|
||||
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
|
||||
return builder.query(query, range);
|
||||
}
|
||||
|
||||
let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query));
|
||||
info!("query_all took {}", elapsed);
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
let view = &self.view;
|
||||
|
||||
'criteria: for criterion in self.criteria.as_ref() {
|
||||
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
|
||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
||||
let mut documents_seen = 0;
|
||||
|
||||
for group in tmp_groups {
|
||||
info!("criterion {}, documents group of size {}", ci, group.len());
|
||||
|
||||
// if this group does not overlap with the requested range,
|
||||
// push it without sorting and splitting it
|
||||
if documents_seen + group.len() < range.start {
|
||||
@ -130,9 +159,12 @@ where D: Deref<Target=DB>,
|
||||
continue;
|
||||
}
|
||||
|
||||
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
|
||||
let (elapsed, ()) = elapsed::measure_time(|| {
|
||||
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
|
||||
});
|
||||
info!("criterion {} sort took {}", ci, elapsed);
|
||||
|
||||
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
|
||||
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
|
||||
documents_seen += group.len();
|
||||
groups.push(group);
|
||||
|
||||
@ -152,25 +184,41 @@ where D: Deref<Target=DB>,
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DistinctQueryBuilder<'a, D, F>
|
||||
pub struct DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
inner: QueryBuilder<'a, D>,
|
||||
function: F,
|
||||
inner: QueryBuilder<'a, D, FI>,
|
||||
function: FD,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
impl<'a, D, F, K> DistinctQueryBuilder<'a, D, F>
|
||||
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>,
|
||||
F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
|
||||
{
|
||||
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self.inner.with_filter(function),
|
||||
function: self.function,
|
||||
size: self.size
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>,
|
||||
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
let mut documents = self.inner.query_all(query);
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
let mut key_cache = HashMap::new();
|
||||
let view = &self.inner.view;
|
||||
|
||||
let mut filter_map = HashMap::new();
|
||||
// these two variables inform on the current distinct map and
|
||||
// on the raw offset of the start of the group where the
|
||||
// range.start bound is located according to the distinct function
|
||||
@ -193,17 +241,27 @@ where D: Deref<Target=DB>,
|
||||
|
||||
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
|
||||
|
||||
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
|
||||
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
|
||||
// we must compute the real distinguished len of this sub-group
|
||||
for document in group.iter() {
|
||||
let entry = key_cache.entry(document.id);
|
||||
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
|
||||
|
||||
match key.clone() {
|
||||
Some(key) => buf_distinct.register(key),
|
||||
None => buf_distinct.register_without_key(),
|
||||
let filter_accepted = match &self.inner.filter {
|
||||
Some(filter) => {
|
||||
let entry = filter_map.entry(document.id);
|
||||
*entry.or_insert_with(|| (filter)(document.id, view))
|
||||
},
|
||||
None => true,
|
||||
};
|
||||
|
||||
if filter_accepted {
|
||||
let entry = key_cache.entry(document.id);
|
||||
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
|
||||
|
||||
match key.clone() {
|
||||
Some(key) => buf_distinct.register(key),
|
||||
None => buf_distinct.register_without_key(),
|
||||
};
|
||||
}
|
||||
|
||||
// the requested range end is reached: stop computing distinct
|
||||
if buf_distinct.len() >= range.end { break }
|
||||
}
|
||||
@ -229,16 +287,22 @@ where D: Deref<Target=DB>,
|
||||
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
|
||||
|
||||
for document in documents.into_iter().skip(distinct_raw_offset) {
|
||||
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
|
||||
|
||||
let accepted = match key {
|
||||
Some(key) => seen.register(key),
|
||||
None => seen.register_without_key(),
|
||||
let filter_accepted = match &self.inner.filter {
|
||||
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
|
||||
None => true,
|
||||
};
|
||||
|
||||
if accepted && seen.len() > range.start {
|
||||
out_documents.push(document);
|
||||
if out_documents.len() == range.len() { break }
|
||||
if filter_accepted {
|
||||
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
|
||||
let distinct_accepted = match key {
|
||||
Some(key) => seen.register(key),
|
||||
None => seen.register_without_key(),
|
||||
};
|
||||
|
||||
if distinct_accepted && seen.len() > range.start {
|
||||
out_documents.push(document);
|
||||
if out_documents.len() == range.len() { break }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@ use std::mem;
|
||||
use self::Separator::*;
|
||||
|
||||
pub trait TokenizerBuilder {
|
||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
|
||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
|
||||
}
|
||||
|
||||
pub struct DefaultBuilder;
|
||||
@ -13,22 +13,39 @@ impl DefaultBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
/// A word produced by the tokenizer along with its position in the text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token<'a> {
    /// The word itself, borrowed from the tokenized text.
    pub word: &'a str,

    /// The position of the word among the words of the text.
    ///
    /// It advances by the accumulated distance of the separators that
    /// precede the word, so it can grow by more than one between two words.
    pub word_index: usize,

    /// The offset of the first character of the word, counted in `char`s
    /// (not bytes) from the start of the text given to the tokenizer.
    pub char_index: usize,
}
|
||||
|
||||
impl TokenizerBuilder for DefaultBuilder {
|
||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
|
||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
|
||||
Box::new(Tokenizer::new(text))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Tokenizer<'a> {
|
||||
index: usize,
|
||||
word_index: usize,
|
||||
char_index: usize,
|
||||
inner: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> {
|
||||
pub fn new(string: &str) -> Tokenizer {
|
||||
let mut char_advance = 0;
|
||||
let mut index_advance = 0;
|
||||
for (n, (i, c)) in string.char_indices().enumerate() {
|
||||
char_advance = n;
|
||||
index_advance = i;
|
||||
if detect_separator(c).is_none() { break }
|
||||
}
|
||||
|
||||
Tokenizer {
|
||||
index: 0,
|
||||
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]),
|
||||
word_index: 0,
|
||||
char_index: char_advance,
|
||||
inner: &string[index_advance..],
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -56,43 +73,58 @@ impl Separator {
|
||||
}
|
||||
}
|
||||
|
||||
fn detect_separator(c: char) -> Option<Separator> {
|
||||
match c {
|
||||
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
|
||||
' ' | '\'' | '"' => Some(Short),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Tokenizer<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
type Item = Token<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut start_word = None;
|
||||
let mut distance = None;
|
||||
|
||||
for (i, c) in self.inner.char_indices() {
|
||||
let separator = match c {
|
||||
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
|
||||
' ' | '\'' | '"' => Some(Short),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
match separator {
|
||||
Some(dist) => {
|
||||
match detect_separator(c) {
|
||||
Some(sep) => {
|
||||
if let Some(start_word) = start_word {
|
||||
let (word, tail) = self.inner.split_at(i);
|
||||
let (prefix, tail) = self.inner.split_at(i);
|
||||
let (spaces, word) = prefix.split_at(start_word);
|
||||
|
||||
self.inner = tail;
|
||||
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||
self.char_index += spaces.chars().count();
|
||||
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||
|
||||
let word = &word[start_word..];
|
||||
return Some((self.index, word))
|
||||
let token = Token {
|
||||
word: word,
|
||||
word_index: self.word_index,
|
||||
char_index: self.char_index,
|
||||
};
|
||||
|
||||
self.char_index += word.chars().count();
|
||||
return Some(token)
|
||||
}
|
||||
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
|
||||
|
||||
distance.replace(distance.map_or(sep, |s| s.add(sep)));
|
||||
},
|
||||
None => { start_word.get_or_insert(i); },
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(start_word) = start_word {
|
||||
let word = mem::replace(&mut self.inner, "");
|
||||
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||
let prefix = mem::replace(&mut self.inner, "");
|
||||
let (spaces, word) = prefix.split_at(start_word);
|
||||
|
||||
let word = &word[start_word..];
|
||||
return Some((self.index, word))
|
||||
let token = Token {
|
||||
word: word,
|
||||
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
|
||||
char_index: self.char_index + spaces.chars().count(),
|
||||
};
|
||||
return Some(token)
|
||||
}
|
||||
|
||||
None
|
||||
@ -107,12 +139,12 @@ mod tests {
|
||||
fn easy() {
|
||||
let mut tokenizer = Tokenizer::new("salut");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some((0, "salut")));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("yo ");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some((0, "yo")));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
|
||||
@ -120,18 +152,37 @@ mod tests {
|
||||
fn hard() {
|
||||
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some((0, "yo")));
|
||||
assert_eq!(tokenizer.next(), Some((1, "lolo")));
|
||||
assert_eq!(tokenizer.next(), Some((9, "aïe")));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
|
||||
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
||||
|
||||
assert_eq!(tokenizer.next(), Some((0, "yo")));
|
||||
assert_eq!(tokenizer.next(), Some((8, "lolo")));
|
||||
assert_eq!(tokenizer.next(), Some((16, "wtf")));
|
||||
assert_eq!(tokenizer.next(), Some((24, "lol")));
|
||||
assert_eq!(tokenizer.next(), Some((32, "aïe")));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
|
||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
|
||||
assert_eq!(tokenizer.next(), None);
|
||||
}
|
||||
|
||||
#[test]
fn hard_long_chars() {
    // Each entry is (word, word_index, char_index). The expected char
    // indexes advance by one per character, even for multi-byte characters
    // such as emoji or accented letters.
    let expected = [
        ("yo", 0, 4),
        ("😂", 1, 7),
        ("aïe", 9, 10),
    ];

    let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
    for &(word, word_index, char_index) in expected.iter() {
        assert_eq!(tokenizer.next(), Some(Token { word: word, word_index: word_index, char_index: char_index }));
    }
    assert_eq!(tokenizer.next(), None);

    let expected = [
        ("yo", 0, 0),
        ("lolo", 8, 5),
        ("😱", 16, 12),
        ("lol", 24, 16),
        ("😣", 32, 22),
    ];

    let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
    for &(word, word_index, char_index) in expected.iter() {
        assert_eq!(tokenizer.next(), Some(Token { word: word, word_index: word_index, char_index: char_index }));
    }
    assert_eq!(tokenizer.next(), None);
}
|
||||
}
|
||||
|
@ -1,51 +0,0 @@
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
|
||||
/// A cheaply clonable, read-only window over a shared `Vec<T>`.
///
/// The backing vector lives behind an `Arc`, so cloning or narrowing a view
/// with [`VecReadOnly::range`] never copies the elements.
///
/// NOTE: the derived comparison and hashing traits work field by field
/// (backing vector, then `offset`, then `len`); they do not compare only
/// the visible window.
#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
    inner: Arc<Vec<T>>,
    offset: usize,
    len: usize,
}

impl<T> VecReadOnly<T> {
    /// Wraps `vec` in a view that covers all of its elements.
    pub fn new(vec: Vec<T>) -> Self {
        let len = vec.len();
        Self {
            inner: Arc::new(vec),
            offset: 0,
            len: len,
        }
    }

    /// Returns the number of elements visible through this view.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns a view of `len` elements starting `offset` elements after the
    /// start of this view, sharing the same backing vector.
    ///
    /// No bounds check happens here: a window that reaches past the backing
    /// vector only panics once it is read through `as_slice` (or `Deref`).
    pub fn range(&self, offset: usize, len: usize) -> Self {
        Self {
            inner: self.inner.clone(),
            offset: self.offset + offset,
            len: len,
        }
    }

    /// Returns the visible window as a slice.
    ///
    /// # Panics
    ///
    /// Panics if the window does not fit inside the backing vector.
    pub fn as_slice(&self) -> &[T] {
        &self.inner[self.offset..self.offset + self.len]
    }
}

impl<T> Deref for VecReadOnly<T> {
    type Target = [T];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
    /// Formats only the elements visible through this view, so the output
    /// matches what `as_slice` returns rather than the whole backing vector.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.as_slice().fmt(f)
    }
}
|
102
src/word_area.rs
Normal file
102
src/word_area.rs
Normal file
@ -0,0 +1,102 @@
|
||||
use std::fmt;
|
||||
|
||||
/// Represents a word position along with the length of the word, packed
/// into a single `u32`.
///
/// The 22 high bits store the position (named `char_index` by the
/// constructor and the accessor) and the 10 low bits store the length.
/// Positions therefore range over `0..2^22` and lengths over `0..2^10`,
/// i.e. a word is at most 1023 long.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);

impl WordArea {
    /// Number of low bits reserved for the length.
    const LENGTH_BITS: u32 = 10;

    /// Largest position that fits in the 22 bits left above the length.
    const MAX_CHAR_INDEX: u32 = (1 << 22) - 1;

    /// Largest length that fits in the 10 low bits.
    const MAX_LENGTH: u16 = (1 << 10) - 1;

    /// Constructs a `WordArea` from a word position expressed as
    /// a number of characters and the length of the word.
    ///
    /// # Errors
    ///
    /// Returns `WordAreaError::ByteIndexTooBig` when `char_index` is
    /// `2^22` or more, and `WordAreaError::LengthTooBig` when `length`
    /// is `1024` or more.
    pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
        // Compare against the real limits instead of a hand-written bit mask:
        // the index must be rejected before shifting, because `<<` silently
        // drops the bits that do not fit.
        if char_index > Self::MAX_CHAR_INDEX {
            return Err(WordAreaError::ByteIndexTooBig)
        }

        if length > Self::MAX_LENGTH {
            return Err(WordAreaError::LengthTooBig)
        }

        let char_index = char_index << Self::LENGTH_BITS;
        Ok(WordArea(char_index | u32::from(length)))
    }

    /// Same as [`WordArea::new`] but panics instead of returning an error.
    ///
    /// # Panics
    ///
    /// Panics when `char_index` is `2^22` or more, or when `length` is
    /// `1024` or more.
    pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
        match WordArea::new(char_index, length) {
            Ok(word_area) => word_area,
            Err(WordAreaError::ByteIndexTooBig) => {
                panic!("word area byte index must not be greater than 2^22")
            },
            Err(WordAreaError::LengthTooBig) => {
                panic!("word area length must not be greater than 1024")
            },
        }
    }

    /// Returns the greatest `WordArea`: every bit of the packed value is set.
    pub(crate) fn max_value() -> WordArea {
        WordArea(u32::max_value())
    }

    /// Returns the word position stored in the 22 high bits.
    #[inline]
    pub fn char_index(self) -> u32 {
        self.0 >> Self::LENGTH_BITS
    }

    /// Returns the word length stored in the 10 low bits.
    #[inline]
    pub fn length(self) -> u16 {
        // The mask keeps 10 bits only, so the cast cannot truncate.
        (self.0 & u32::from(Self::MAX_LENGTH)) as u16
    }
}

impl fmt::Debug for WordArea {
    /// Shows the unpacked fields rather than the raw packed integer.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("WordArea")
            .field("char_index", &self.char_index())
            .field("length", &self.length())
            .finish()
    }
}

/// Reasons why a `WordArea` cannot be built from the given values.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum WordAreaError {
    /// The word position does not fit in 22 bits.
    ByteIndexTooBig,
    /// The word length does not fit in 10 bits.
    LengthTooBig,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};

    quickcheck! {
        // Packing then unpacking must give back the original values.
        fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
            // 2^22 and 2^10 themselves are not representable and would make
            // `new_faillible` panic, so they are discarded as well.
            if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
                return TestResult::discard()
            }

            let word_area = WordArea::new_faillible(gen_char_index, gen_length);

            let valid_char_index = word_area.char_index() == gen_char_index;
            let valid_length = word_area.length() == gen_length;

            TestResult::from_bool(valid_char_index && valid_length)
        }

        // A word area with a greater index and length must compare greater.
        fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
            // Both values are incremented below, so the largest representable
            // ones must be discarded too.
            if gen_char_index >= 2_u32.pow(22) - 1 || gen_length >= 2_u16.pow(10) - 1 {
                return TestResult::discard()
            }

            let a = WordArea::new_faillible(gen_char_index, gen_length);
            let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);

            TestResult::from_bool(a < b)
        }
    }
}
|
Reference in New Issue
Block a user