Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-21 05:41:01 +00:00)

Compare commits (111 commits)
Commit SHA1:

aef7d7825f
f28ce661af
74eb9c8d0f
d664221c64
58bff3d4ac
2c206eb98c
19724e5af9
c9e0ad132c
24f265a963
f8a743ee00
64971de7ed
a960c325f3
a799470997
10414791a2
743974e60d
0e267cae4b
12a352ae2f
5070b27728
7a6b734078
24823da6f7
8701cb3a8f
315fc1fbe3
23833bac10
8235b6efc9
7f937eea5a
a1cf634ac1
c86472e997
26cb398a6f
f6e664d298
9437cecf87
13309511b3
1941cb16c0
55823c5d5d
4721da1679
482f750231
d5119db165
37578ed74f
f5992ce822
badb0035c5
4bc14aa261
a0c4ec0be0
264fffa826
bddb37e44f
6393b0cbc0
a8df438814
8014857ebf
9e7261a48f
c4e70d0475
cbb0aaa217
ce50e74491
e103e1c277
64929fe5dc
b108f1e6c9
58b417e045
2e5a616d8e
092d446a7e
85a1f126bf
cf58cf86da
db6210c7ee
83cd071827
084c3a95b6
78908aa34e
cf27706f91
d3f53a7fd6
508af5613f
c615c31016
908b28790b
4c0279729b
96dfac5b33
8576218b51
1c1f9201b8
4398b88a3a
73e79f5ca4
1bfd51d6e9
0d2daf27f2
87f0d8cf3c
06d5a10902
94b89c5439
c5e951be09
66ae5c8161
8438e2202f
7a6166d229
d46fa4b215
2bd5b4ab86
5efbc5ceb3
2e905bac08
4c0ad5f964
455cbf3bf4
a3a28c56fa
b0b3175641
c2f0df3f73
820f1f9ac6
337aee5b65
810dfdf656
f016652fca
6c99ebe3fa
94d357985f
fbc698567a
aa9db14c09
61e83a1c21
1316be5b09
4e8b0383dd
4fa10753c1
2473e289e8
e0e5e87ed3
b13e61f40a
c023cb3065
0a3d069fbc
fa062ce2cf
cdc6e47bf5
d5f44838be
@@ -11,8 +11,8 @@ matrix:
   include:
 
     # Test crates on their minimum Rust versions.
-    - rust: 1.31.0
-      name: "meilidb on 1.31.0"
+    - rust: 1.32.0
+      name: "meilidb on 1.32.0"
       script: ./ci/meilidb.sh
 
     # Test crates on nightly Rust.
Cargo.toml (58 lines changed)

@@ -1,23 +1,28 @@
 [package]
 edition = "2018"
 name = "meilidb"
-version = "0.2.0"
+version = "0.3.2"
 authors = ["Kerollmops <renault.cle@gmail.com>"]
 
 [dependencies]
-bincode = "1.0"
-byteorder = "1.2"
-crossbeam = "0.6"
-fst = "0.3"
-hashbrown = { version = "0.1", features = ["serde"] }
-lazy_static = "1.1"
-levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
-linked-hash-map = { version = "0.5", features = ["serde_impl"] }
-log = "0.4"
-sdset = "0.3"
-serde = "1.0"
-serde_derive = "1.0"
-unidecode = "0.3"
+arc-swap = "0.3.7"
+bincode = "1.1.2"
+byteorder = "1.3.1"
+fst = "0.3.3"
+hashbrown = { version = "0.1.8", features = ["serde"] }
+lazy_static = "1.2.0"
+levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
+linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
+lockfree = "0.5.1"
+log = "0.4.6"
+rayon = "1.0.3"
+sdset = "0.3.1"
+serde = "1.0.88"
+serde_derive = "1.0.88"
+serde_json = { version = "1.0.38", features = ["preserve_order"] }
+size_format = "1.0.2"
+slice-group-by = "0.2.4"
+unidecode = "0.3.0"
 
 [dependencies.toml]
 git = "https://github.com/Kerollmops/toml-rs.git"

@@ -28,28 +33,23 @@ rev = "0372ba6"
 git = "https://github.com/pingcap/rust-rocksdb.git"
 rev = "306e201"
 
-[dependencies.group-by]
-git = "https://github.com/Kerollmops/group-by.git"
-rev = "5a113fe"
-
 [features]
 default = ["simd"]
 i128 = ["bincode/i128", "byteorder/i128"]
 portable = ["rocksdb/portable"]
 simd = ["rocksdb/sse"]
-nightly = ["hashbrown/nightly", "group-by/nightly"]
+nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
 
 [dev-dependencies]
-csv = "1.0"
-elapsed = "0.1"
-env_logger = "0.6"
-jemallocator = "0.1"
-quickcheck = "0.8"
-rand = "0.6"
-rand_xorshift = "0.1"
-structopt = "0.2"
-tempfile = "3.0"
-termcolor = "1.0"
+csv = "1.0.5"
+env_logger = "0.6.0"
+jemallocator = "0.1.9"
+quickcheck = "0.8.2"
+rand = "0.6.5"
+rand_xorshift = "0.1.1"
+structopt = "0.2.14"
+tempfile = "3.0.7"
+termcolor = "1.0.4"
 
 [profile.release]
 debug = true
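The dependency changes above drop the Git `group-by` dependency in favour of the published `slice-group-by` crate (the `nightly` feature now points at `slice-group-by/nightly`). Below is a minimal sketch of the run-grouping that crate provides; it assumes the `GroupBy` trait and `linear_group_by` method documented for the 0.2.x releases, and the data is made up for illustration.

```rust
// Hypothetical illustration, not taken from the MeiliDB sources:
// group consecutive equal document ids with slice-group-by's linear scan.
use slice_group_by::GroupBy;

fn main() {
    let doc_ids = [1u64, 1, 1, 3, 3, 7, 7, 7, 7];

    // `linear_group_by` yields maximal subslices on which the
    // predicate holds between every pair of neighbours.
    for group in doc_ids.linear_group_by(|a, b| a == b) {
        println!("doc {} appears {} times", group[0], group.len());
    }
}
```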
README.md (22 lines changed)

@@ -10,7 +10,7 @@ A _full-text search database_ using a key-value store internally.
 
 It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
 
-You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
+You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
 
 We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
 

@@ -22,20 +22,20 @@ MeiliDB will be a binary in a near future so you will be able to use it as a dat
 
 ## Performances
 
-With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
-So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
+With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
+So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
 
-Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
+Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.
 
 ```
 Running 10s test @ http://localhost:2230
-  2 threads and 12 connections
+  2 threads and 25 connections
   Thread Stats   Avg      Stdev     Max   +/- Stdev
-    Latency    18.86ms   49.39ms 614.89ms   95.23%
-    Req/Sec   620.41     59.53   790.00     65.00%
-  12359 requests in 10.00s, 3.26MB read
-Requests/sec:   1235.54
-Transfer/sec:    334.22KB
+    Latency     9.52ms    7.61ms  99.25ms   84.58%
+    Req/Sec     1.41k   119.11     1.78k    64.50%
+  28080 requests in 10.01s, 7.42MB read
+Requests/sec:   2806.46
+Transfer/sec:    759.17KB
 ```
 
 ### Notes

@@ -49,7 +49,7 @@ MeiliDB runs with an index like most search engines.
 So to test the library you can create one by indexing a simple csv file.
 
 ```bash
-cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml --stop-words misc/fr.stopwords.txt
+cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
 ```
 
 Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
@@ -1,17 +1,18 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
+use std::collections::{HashMap, HashSet};
 use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
+use std::time::Instant;
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;
 
-use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;
 
-use meilidb::database::{Database, Schema, UpdateBuilder};
+use meilidb::database::{Database, Schema};
 use meilidb::tokenizer::DefaultBuilder;
 
 #[derive(Debug, StructOpt)]

@@ -50,7 +51,9 @@ fn index(
     stop_words: &HashSet<String>,
 ) -> Result<Database, Box<Error>>
 {
-    let database = Database::create(database_path, &schema)?;
+    let database = Database::create(database_path)?;
+
+    database.create_index("default", &schema)?;
 
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();

@@ -61,8 +64,7 @@ fn index(
 
     while !end_of_file {
         let tokenizer_builder = DefaultBuilder::new();
-        let update_path = tempfile::NamedTempFile::new()?;
-        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+        let mut update = database.start_update("default")?;
 
         loop {
             end_of_file = !rdr.read_record(&mut raw_record)?;

@@ -88,10 +90,8 @@ fn index(
 
         println!();
 
-        println!("building update...");
-        let update = update.build()?;
-        println!("ingesting update...");
-        database.ingest_update_file(update)?;
+        println!("committing update...");
+        database.commit_update(update)?;
     }
 
     Ok(database)

@@ -125,14 +125,13 @@ fn main() -> Result<(), Box<Error>> {
         None => HashSet::new(),
     };
 
-    let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
-    });
+    let start = Instant::now();
+    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
 
     if let Err(e) = result {
         return Err(e.into())
     }
 
-    println!("database created in {} at: {:?}", elapsed, opt.database_path);
+    println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
     Ok(())
 }
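The hunks above replace the temporary-file `UpdateBuilder` flow with a named-index workflow driven by `Database` itself. Below is a condensed sketch of that flow, using only the calls visible in this diff; the literal path, the schema value and the way documents are pushed into the update are placeholders, since those parts are not shown here.

```rust
// Condensed from the hunks above; error handling and document insertion elided.
use std::error::Error;
use meilidb::database::{Database, Schema};

fn rebuild(schema: Schema) -> Result<(), Box<Error>> {
    let database = Database::create("test.mdb")?;    // was Database::create(path, &schema)
    database.create_index("default", &schema)?;      // the index now carries the schema

    let update = database.start_update("default")?;  // replaces tempfile + UpdateBuilder
    // ... feed the CSV records into `update` here (not shown in this diff) ...
    database.commit_update(update)?;                 // replaces build() + ingest_update_file()

    Ok(())
}
```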
@@ -4,6 +4,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 use std::collections::btree_map::{BTreeMap, Entry};
 use std::iter::FromIterator;
 use std::io::{self, Write};
+use std::time::Instant;
 use std::path::PathBuf;
 use std::error::Error;
 

@@ -27,6 +28,10 @@ pub struct Opt {
     /// The number of returned results
     #[structopt(short = "n", long = "number-results", default_value = "10")]
     pub number_results: usize,
+
+    /// The number of characters before and after the first match
+    #[structopt(short = "C", long = "context", default_value = "35")]
+    pub char_context: usize,
 }
 
 type Document = HashMap<String, String>;

@@ -66,16 +71,12 @@ fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize)
     (byte_index, byte_length)
 }
 
-fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
+fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
     let mut byte_indexes = BTreeMap::new();
 
     for match_ in matches {
-        let match_attribute = match_.attribute.attribute();
-        if SchemaAttr::new(match_attribute) == attribute {
-            let word_area = match_.word_area;
-
-            let char_index = word_area.char_index() as usize;
-            let char_length = word_area.length() as usize;
+        let char_index = match_.char_index as usize;
+        let char_length = match_.char_length as usize;
         let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
 
         match byte_indexes.entry(byte_index) {

@@ -87,7 +88,6 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
             },
         }
     }
-    }
 
     let mut title_areas = Vec::new();
     title_areas.push(0);

@@ -100,13 +100,46 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
     title_areas
 }
 
+/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
+///
+/// ```no_run
+/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+///
+/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
+///
+/// let (text, matches) = crop_text(&text, matches, 35);
+/// ```
+fn crop_text(
+    text: &str,
+    matches: impl IntoIterator<Item=Match>,
+    context: usize,
+) -> (String, Vec<Match>)
+{
+    let mut matches = matches.into_iter().peekable();
+
+    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
+    let start = char_index.saturating_sub(context);
+    let text = text.chars().skip(start).take(context * 2).collect();
+
+    let matches = matches
+        .take_while(|m| {
+            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
+        })
+        .map(|match_| {
+            Match { char_index: match_.char_index - start as u32, ..match_ }
+        })
+        .collect();
+
+    (text, matches)
+}
+
 fn main() -> Result<(), Box<Error>> {
     let _ = env_logger::init();
     let opt = Opt::from_args();
 
-    let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
-    let database = result?;
-    println!("database prepared for you in {}", elapsed);
+    let start = Instant::now();
+    let database = Database::open(&opt.database_path)?;
+    println!("database prepared for you in {:.2?}", start.elapsed());
 
     let mut buffer = String::new();
     let input = io::stdin();

@@ -118,16 +151,19 @@ fn main() -> Result<(), Box<Error>> {
         if input.read_line(&mut buffer)? == 0 { break }
         let query = buffer.trim_end_matches('\n');
 
-        let view = database.view();
+        let view = database.view("default")?;
         let schema = view.schema();
 
-        let (elapsed, documents) = elapsed::measure_time(|| {
-            let builder = view.query_builder().unwrap();
-            builder.query(query, 0..opt.number_results)
-        });
+        let start = Instant::now();
+        let builder = view.query_builder();
+        let documents = builder.query(query, 0..opt.number_results);
 
         let number_of_documents = documents.len();
-        for doc in documents {
+        for mut doc in documents {
+
+            doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index));
+
             match view.document_by_id::<Document>(doc.id) {
                 Ok(document) => {
                     for name in &opt.displayed_fields {

@@ -141,7 +177,11 @@ fn main() -> Result<(), Box<Error>> {
                         };
 
                         print!("{}: ", name);
-                        let areas = create_highlight_areas(&text, &doc.matches, attr);
+                        let matches = doc.matches.iter()
+                            .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                            .cloned();
+                        let (text, matches) = crop_text(&text, matches, opt.char_context);
+                        let areas = create_highlight_areas(&text, &matches);
                         display_highlights(&text, &areas)?;
                         println!();
                     }

@@ -151,7 +191,7 @@ fn main() -> Result<(), Box<Error>> {
 
             let mut matching_attributes = HashSet::new();
             for _match in doc.matches {
-                let attr = SchemaAttr::new(_match.attribute.attribute());
+                let attr = SchemaAttr::new(_match.attribute);
                 let name = schema.attribute_name(attr);
                 matching_attributes.insert(name);
             }

@@ -162,7 +202,7 @@ fn main() -> Result<(), Box<Error>> {
             println!();
         }
 
-        eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
+        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
         buffer.clear();
     }
 
src/attribute.rs (105 lines changed)

@@ -1,105 +0,0 @@
-use std::fmt;
-
-/// Represent an attribute number along with the word index
-/// according to the tokenizer used.
-///
-/// It can accept up to 1024 attributes and word positions
-/// can be maximum 2^22.
-#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Attribute(u32);
-
-impl Attribute {
-    /// Construct an `Attribute` from an attribute number and
-    /// the word position of a match according to the tokenizer used.
-    pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
-        if attribute & 0b1111_1100_0000_0000 != 0 {
-            return Err(AttributeError::AttributeTooBig)
-        }
-
-        if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
-            return Err(AttributeError::IndexTooBig)
-        }
-
-        let attribute = u32::from(attribute) << 22;
-        Ok(Attribute(attribute | index))
-    }
-
-    /// Construct an `Attribute` from an attribute number and
-    /// the word position of a match according to the tokenizer used.
-    ///
-    /// # Panics
-    ///
-    /// The attribute must not be greater than 1024
-    /// and the word index not greater than 2^22.
-    pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
-        match Attribute::new(attribute, index) {
-            Ok(attribute) => attribute,
-            Err(AttributeError::AttributeTooBig) => {
-                panic!("attribute must not be greater than 1024")
-            },
-            Err(AttributeError::IndexTooBig) => {
-                panic!("attribute word index must not be greater than 2^22")
-            },
-        }
-    }
-
-    pub(crate) fn max_value() -> Attribute {
-        Attribute(u32::max_value())
-    }
-
-    #[inline]
-    pub fn attribute(self) -> u16 {
-        (self.0 >> 22) as u16
-    }
-
-    #[inline]
-    pub fn word_index(self) -> u32 {
-        self.0 & 0b0000_0000_0011_1111_1111_1111_1111
-    }
-}
-
-impl fmt::Debug for Attribute {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Attribute")
-            .field("attribute", &self.attribute())
-            .field("word_index", &self.word_index())
-            .finish()
-    }
-}
-
-pub enum AttributeError {
-    AttributeTooBig,
-    IndexTooBig,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use quickcheck::{quickcheck, TestResult};
-
-    quickcheck! {
-        fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
-            if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) {
-                return TestResult::discard()
-            }
-
-            let attribute = Attribute::new_faillible(gen_attr, gen_index);
-
-            let valid_attribute = attribute.attribute() == gen_attr;
-            let valid_index = attribute.word_index() == gen_index;
-
-            TestResult::from_bool(valid_attribute && valid_index)
-        }
-
-        fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
-            if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
-                return TestResult::discard()
-            }
-
-            let a = Attribute::new_faillible(gen_attr, gen_index);
-            let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
-
-            TestResult::from_bool(a < b)
-        }
-    }
-}
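The deleted `Attribute` type packed a 10-bit attribute number and a 22-bit word index into a single `u32`; the masks and shifts above are the whole trick. Here is a standalone illustration of that packing scheme (plain functions written for this note, not MeiliDB API):

```rust
// Standalone illustration of the packing the deleted `Attribute` type used:
// 10 high bits for the attribute number, 22 low bits for the word index.
fn pack(attribute: u16, word_index: u32) -> u32 {
    assert!(attribute < 1024, "attribute must fit in 10 bits");
    assert!(word_index < (1 << 22), "word index must fit in 22 bits");
    (u32::from(attribute) << 22) | word_index
}

fn unpack(packed: u32) -> (u16, u32) {
    ((packed >> 22) as u16, packed & ((1 << 22) - 1))
}

fn main() {
    let packed = pack(3, 11);
    assert_eq!(unpack(packed), (3, 11));
    println!("attribute 3, word 11 packs to {:#034b}", packed);
}
```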
@@ -1,12 +1,15 @@
-use std::io::{self, Cursor, BufRead};
 use std::slice::from_raw_parts;
 use std::mem::size_of;
+use std::error::Error;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use sdset::Set;
 
-use crate::DocumentId;
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::SharedData;
+use crate::DocumentId;
 
 use super::into_u8_slice;
 
 #[derive(Default, Clone)]

@@ -19,21 +22,6 @@ impl DocIds {
         DocIds(data)
     }
 
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let doc_ids = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        Ok(DocIds(doc_ids))
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let len = self.0.len() as u64;
-        bytes.write_u64::<LittleEndian>(len).unwrap();
-        bytes.extend_from_slice(&self.0);
-    }
-
     pub fn is_empty(&self) -> bool {
         self.0.is_empty()
     }

@@ -52,3 +40,22 @@ impl AsRef<Set<DocumentId>> for DocIds {
         Set::new_unchecked(slice)
     }
 }
+
+impl FromSharedDataCursor for DocIds {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let data = cursor.extract(len);
+
+        Ok(DocIds(data))
+    }
+}
+
+impl WriteToBytes for DocIds {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let len = self.0.len() as u64;
+        bytes.write_u64::<LittleEndian>(len).unwrap();
+        bytes.extend_from_slice(&self.0);
+    }
+}
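`WriteToBytes` and `FromSharedDataCursor` above frame the document-id set as a little-endian `u64` byte count followed by the raw payload. Below is a minimal, self-contained sketch of that framing with `byteorder`; the helper names are made up for this note.

```rust
// Minimal sketch of the length-prefixed framing used by DocIds (and DocIndexes below):
// a little-endian u64 byte count, then the payload itself.
use std::io::Cursor;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

fn write_block(out: &mut Vec<u8>, payload: &[u8]) {
    out.write_u64::<LittleEndian>(payload.len() as u64).unwrap();
    out.extend_from_slice(payload);
}

fn read_block(cursor: &mut Cursor<&[u8]>) -> Vec<u8> {
    let len = cursor.read_u64::<LittleEndian>().unwrap() as usize;
    let start = cursor.position() as usize;
    let block = cursor.get_ref()[start..start + len].to_vec();
    cursor.set_position((start + len) as u64);
    block
}

fn main() {
    let mut bytes = Vec::new();
    write_block(&mut bytes, b"doc-ids");
    write_block(&mut bytes, b"doc-indexes");

    let mut cursor = Cursor::new(bytes.as_slice());
    assert_eq!(read_block(&mut cursor), b"doc-ids");
    assert_eq!(read_block(&mut cursor), b"doc-indexes");
}
```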
@@ -1,14 +1,16 @@
-use std::io::{self, Write, Cursor, BufRead};
+use std::io::{self, Write};
 use std::slice::from_raw_parts;
 use std::mem::size_of;
 use std::ops::Index;
-use std::sync::Arc;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use sdset::Set;
 
-use crate::DocIndex;
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::SharedData;
+use crate::DocIndex;
 
 use super::into_u8_slice;
 
 #[derive(Debug)]

@@ -25,38 +27,6 @@ pub struct DocIndexes {
 }
 
 impl DocIndexes {
-    pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
-        let bytes = Arc::new(bytes);
-        let len = bytes.len();
-        let data = SharedData::new(bytes, 0, len);
-        let mut cursor = Cursor::new(data);
-        DocIndexes::from_cursor(&mut cursor)
-    }
-
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let ranges = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let indexes = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        Ok(DocIndexes { ranges, indexes })
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let ranges_len = self.ranges.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(ranges_len);
-        bytes.extend_from_slice(&self.ranges);
-
-        let indexes_len = self.indexes.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
-        bytes.extend_from_slice(&self.indexes);
-    }
-
     pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
         self.ranges().get(index).map(|Range { start, end }| {
             let start = *start as usize;

@@ -92,6 +62,32 @@ impl Index<usize> for DocIndexes {
     }
 }
 
+impl FromSharedDataCursor for DocIndexes {
+    type Error = io::Error;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let ranges = cursor.extract(len);
+
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let indexes = cursor.extract(len);
+
+        Ok(DocIndexes { ranges, indexes })
+    }
+}
+
+impl WriteToBytes for DocIndexes {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let ranges_len = self.ranges.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(ranges_len);
+        bytes.extend_from_slice(&self.ranges);
+
+        let indexes_len = self.indexes.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
+        bytes.extend_from_slice(&self.indexes);
+    }
+}
+
 pub struct DocIndexesBuilder<W> {
     ranges: Vec<Range>,
     indexes: Vec<DocIndex>,

@@ -147,29 +143,32 @@ impl<W: Write> DocIndexesBuilder<W> {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
-
     use std::error::Error;
-    use crate::{Attribute, WordArea};
 
     use crate::DocumentId;
+    use super::*;
 
     #[test]
     fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();

@@ -193,18 +192,24 @@ mod tests {
     fn serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
        };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();
@@ -1,55 +1,13 @@
 mod doc_ids;
 mod doc_indexes;
+mod shared_data;
 
 use std::slice::from_raw_parts;
 use std::mem::size_of;
-use std::ops::Deref;
-use std::sync::Arc;
 
 pub use self::doc_ids::DocIds;
 pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
-
-#[derive(Default, Clone)]
-pub struct SharedData {
-    pub bytes: Arc<Vec<u8>>,
-    pub offset: usize,
-    pub len: usize,
-}
-
-impl SharedData {
-    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
-        let len = vec.len();
-        let bytes = Arc::new(vec);
-        SharedData::new(bytes, 0, len)
-    }
-
-    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
-        SharedData { bytes, offset, len }
-    }
-
-    pub fn range(&self, offset: usize, len: usize) -> SharedData {
-        assert!(offset + len <= self.len);
-        SharedData {
-            bytes: self.bytes.clone(),
-            offset: self.offset + offset,
-            len: len,
-        }
-    }
-}
-
-impl Deref for SharedData {
-    type Target = [u8];
-
-    fn deref(&self) -> &Self::Target {
-        self.as_ref()
-    }
-}
-
-impl AsRef<[u8]> for SharedData {
-    fn as_ref(&self) -> &[u8] {
-        &self.bytes[self.offset..self.offset + self.len]
-    }
-}
+pub use self::shared_data::SharedData;
 
 unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
     let ptr = slice.as_ptr() as *const u8;
src/data/shared_data.rs (new file, 48 lines)

@@ -0,0 +1,48 @@
+use std::sync::Arc;
+use std::ops::Deref;
+
+#[derive(Default, Clone)]
+pub struct SharedData {
+    pub bytes: Arc<Vec<u8>>,
+    pub offset: usize,
+    pub len: usize,
+}
+
+impl SharedData {
+    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
+        let len = vec.len();
+        let bytes = Arc::from(vec);
+        SharedData::new(bytes, 0, len)
+    }
+
+    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+        SharedData { bytes, offset, len }
+    }
+
+    pub fn as_slice(&self) -> &[u8] {
+        &self.bytes[self.offset..self.offset + self.len]
+    }
+
+    pub fn range(&self, offset: usize, len: usize) -> SharedData {
+        assert!(offset + len <= self.len);
+        SharedData {
+            bytes: self.bytes.clone(),
+            offset: self.offset + offset,
+            len: len,
+        }
+    }
+}
+
+impl Deref for SharedData {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl AsRef<[u8]> for SharedData {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
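`SharedData` is a cheaply clonable view over one heap allocation: an `Arc<Vec<u8>>` plus an offset and a length, with `range` re-slicing the same buffer instead of copying it. A small usage sketch built only from the constructors shown above; the `meilidb::data` import path is assumed from the `pub use` in src/data/mod.rs.

```rust
// One allocation, several non-copying views over different byte ranges.
use std::sync::Arc;
use meilidb::data::SharedData; // path assumed from the re-export shown above

fn main() {
    let data = SharedData::from_bytes(b"hello meilidb".to_vec());

    let hello = data.range(0, 5); // shares the same Arc<Vec<u8>>
    let word = data.range(6, 7);

    assert_eq!(hello.as_slice(), b"hello");
    assert_eq!(word.as_slice(), b"meilidb");
    assert_eq!(Arc::strong_count(&data.bytes), 3); // data, hello and word share one buffer
}
```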
src/database/config.rs (new file, 46 lines)

@@ -0,0 +1,46 @@
+use std::collections::{HashSet, HashMap};
+use serde_derive::{Serialize, Deserialize};
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum RankingOrdering {
+    Asc,
+    Dsc
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct AccessToken {
+    pub read_key: String,
+    pub write_key: String,
+    pub admin_key: String,
+}
+
+
+#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Config {
+    pub stop_words: Option<HashSet<String>>,
+    pub ranking_order: Option<Vec<String>>,
+    pub distinct_field: Option<String>,
+    pub ranking_rules: Option<HashMap<String, RankingOrdering>>,
+    pub access_token: Option<AccessToken>,
+}
+
+impl Config {
+    pub fn update_with(&mut self, new: Config) {
+        if let Some(stop_words) = new.stop_words {
+            self.stop_words = Some(stop_words);
+        };
+        if let Some(ranking_order) = new.ranking_order {
+            self.ranking_order = Some(ranking_order);
+        };
+        if let Some(distinct_field) = new.distinct_field {
+            self.distinct_field = Some(distinct_field);
+        };
+        if let Some(ranking_rules) = new.ranking_rules {
+            self.ranking_rules = Some(ranking_rules);
+        };
+        if let Some(access_token) = new.access_token {
+            self.access_token = Some(access_token);
+        };
+    }
+}
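`Config::update_with` overlays a patch onto an existing configuration: only the fields that are `Some` in the incoming value overwrite the stored ones. A short usage sketch based on the struct above; the `meilidb::database::Config` path is taken from the `pub use self::config::Config` re-export that appears later in this diff.

```rust
// Only the fields present in the patch are overwritten; the rest keep their values.
use std::collections::HashSet;
use meilidb::database::Config; // re-export shown later in src/database/mod.rs

fn main() {
    let mut config = Config {
        stop_words: Some(HashSet::new()),
        distinct_field: Some("id".to_string()),
        ..Config::default()
    };

    let patch = Config {
        distinct_field: Some("slug".to_string()),
        ..Config::default()
    };

    config.update_with(patch);

    assert!(config.stop_words.is_some()); // untouched: the patch had None here
    assert_eq!(config.distinct_field, Some("slug".to_string())); // overwritten
}
```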
@@ -38,6 +38,10 @@ impl DocumentKey {
         DocumentKeyAttr::new(self.document_id(), attr)
     }
 
+    pub fn with_attribute_min(&self) -> DocumentKeyAttr {
+        DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
+    }
+
     pub fn with_attribute_max(&self) -> DocumentKeyAttr {
         DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
     }
@@ -1,60 +1,45 @@
-use std::io::{Write, BufRead, Cursor};
 use std::error::Error;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use fst::{map, Map, Streamer, IntoStreamer};
-use sdset::{Set, SetOperation};
-use sdset::duo::Union;
+use fst::{map, Map, IntoStreamer, Streamer};
 use fst::raw::Fst;
+use sdset::duo::{Union, DifferenceByKey};
+use sdset::{Set, SetOperation};
 
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::{DocIndexes, DocIndexesBuilder};
-use crate::data::SharedData;
-use crate::DocIndex;
+use crate::{DocumentId, DocIndex};
 
 #[derive(Default)]
-pub struct Positive {
-    map: Map,
-    indexes: DocIndexes,
+pub struct Index {
+    pub map: Map,
+    pub indexes: DocIndexes,
 }
 
-impl Positive {
-    pub fn new(map: Map, indexes: DocIndexes) -> Positive {
-        Positive { map, indexes }
-    }
-
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let data = cursor.get_ref().range(offset, len);
-
-        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
-        let map = Map::from(fst);
-        cursor.consume(len);
-
-        let indexes = DocIndexes::from_cursor(cursor)?;
-
-        Ok(Positive { map, indexes})
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.map.as_fst().as_bytes();
-        let len = slice.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(len);
-        bytes.extend_from_slice(slice);
-
-        self.indexes.write_to_bytes(bytes);
-    }
-
-    pub fn map(&self) -> &Map {
-        &self.map
-    }
-
-    pub fn indexes(&self) -> &DocIndexes {
-        &self.indexes
-    }
-
-    pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
-        let mut builder = PositiveBuilder::memory();
+impl Index {
+    pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
+        let mut buffer = Vec::new();
+        let mut builder = IndexBuilder::new();
+        let mut stream = self.into_stream();
+
+        while let Some((key, indexes)) = stream.next() {
+            buffer.clear();
+
+            let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
+            op.extend_vec(&mut buffer);
+
+            if !buffer.is_empty() {
+                let indexes = Set::new_unchecked(&buffer);
+                builder.insert(key, indexes).unwrap();
+            }
+        }
+
+        builder.build()
+    }
+
+    pub fn union(&self, other: &Index) -> Index {
+        let mut builder = IndexBuilder::new();
         let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
 
         let mut buffer = Vec::new();

@@ -63,19 +48,19 @@ impl Positive {
             match ivalues {
                 [a, b] => {
                     let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
+                    let indexes = &indexes[a.value as usize];
                     let a = Set::new_unchecked(indexes);
 
                     let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
+                    let indexes = &indexes[b.value as usize];
                     let b = Set::new_unchecked(indexes);
 
                     let op = Union::new(a, b);
                     op.extend_vec(&mut buffer);
                 },
-                [a] => {
-                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
+                [x] => {
+                    let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
+                    let indexes = &indexes[x.value as usize];
                     buffer.extend_from_slice(indexes)
                 },
                 _ => continue,

@@ -83,23 +68,45 @@ impl Positive {
 
             if !buffer.is_empty() {
                 let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes)?;
+                builder.insert(key, indexes).unwrap();
             }
         }
 
-        let (map, indexes) = builder.into_inner()?;
-        let map = Map::from_bytes(map)?;
-        let indexes = DocIndexes::from_bytes(indexes)?;
-        Ok(Positive { map, indexes })
+        builder.build()
     }
 }
 
-impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
+impl FromSharedDataCursor for Index {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let data = cursor.extract(len);
+
+        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
+        let map = Map::from(fst);
+
+        let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
+
+        Ok(Index { map, indexes })
+    }
+}
+
+impl WriteToBytes for Index {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let slice = self.map.as_fst().as_bytes();
+        let len = slice.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(len);
+        bytes.extend_from_slice(slice);
+
+        self.indexes.write_to_bytes(bytes);
+    }
+}
+
+impl<'m, 'a> IntoStreamer<'a> for &'m Index {
     type Item = (&'a [u8], &'a Set<DocIndex>);
-    /// The type of the stream to be constructed.
     type Into = Stream<'m>;
 
-    /// Construct a stream from `Self`.
     fn into_stream(self) -> Self::Into {
         Stream {
             map_stream: self.map.into_stream(),

@@ -128,28 +135,26 @@ impl<'m, 'a> Streamer<'a> for Stream<'m> {
     }
 }
 
-pub struct PositiveBuilder<W, X> {
-    map: fst::MapBuilder<W>,
-    indexes: DocIndexesBuilder<X>,
+pub struct IndexBuilder {
+    map: fst::MapBuilder<Vec<u8>>,
+    indexes: DocIndexesBuilder<Vec<u8>>,
     value: u64,
 }
 
-impl PositiveBuilder<Vec<u8>, Vec<u8>> {
-    pub fn memory() -> Self {
-        PositiveBuilder {
+impl IndexBuilder {
+    pub fn new() -> Self {
+        IndexBuilder {
             map: fst::MapBuilder::memory(),
             indexes: DocIndexesBuilder::memory(),
             value: 0,
         }
     }
-}
 
-impl<W: Write, X: Write> PositiveBuilder<W, X> {
     /// If a key is inserted that is less than or equal to any previous key added,
     /// then an error is returned. Similarly, if there was a problem writing
     /// to the underlying writer, an error is returned.
     // FIXME what if one write doesn't work but the other do ?
-    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
+    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
     where K: AsRef<[u8]>,
     {
         self.map.insert(key, self.value)?;

@@ -158,9 +163,13 @@ impl<W: Write, X: Write> PositiveBuilder<W, X> {
         Ok(())
     }
 
-    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
-        let map = self.map.into_inner()?;
-        let indexes = self.indexes.into_inner()?;
-        Ok((map, indexes))
+    pub fn build(self) -> Index {
+        let map = self.map.into_inner().unwrap();
+        let indexes = self.indexes.into_inner().unwrap();
+
+        let map = Map::from_bytes(map).unwrap();
+        let indexes = DocIndexes::from_bytes(indexes).unwrap();
+
+        Index { map, indexes }
     }
 }
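Both `remove_documents` and `union` above stream every key of the fst `Map` and rebuild a fresh `Index` through `IndexBuilder`; the per-key set arithmetic is delegated to sdset. Below is a minimal sketch of the `DifferenceByKey` step used to drop postings of removed documents, with plain `u64` values standing in for `DocIndex` and `DocumentId` (types simplified for this note).

```rust
// Postings whose key appears in the removed-documents set are dropped.
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};

fn main() {
    let postings = Set::new(&[1u64, 2, 4, 8]).unwrap(); // must be sorted and deduplicated
    let removed = Set::new(&[2u64, 8]).unwrap();

    let mut buffer = Vec::new();
    let op = DifferenceByKey::new(postings, removed, |x| *x, |x| *x);
    op.extend_vec(&mut buffer); // same call the diff uses

    assert_eq!(buffer, vec![1u64, 4]);
}
```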
@@ -1,82 +0,0 @@
-mod negative;
-mod positive;
-
-pub(crate) use self::negative::Negative;
-pub(crate) use self::positive::{Positive, PositiveBuilder};
-
-use std::error::Error;
-use std::io::Cursor;
-use std::sync::Arc;
-
-use fst::{IntoStreamer, Streamer};
-use sdset::duo::DifferenceByKey;
-use sdset::{Set, SetOperation};
-use fst::Map;
-
-use crate::data::{SharedData, DocIndexes};
-
-#[derive(Default)]
-pub struct Index {
-    pub(crate) negative: Negative,
-    pub(crate) positive: Positive,
-}
-
-impl Index {
-    pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
-        let len = bytes.len();
-        Index::from_shared_bytes(Arc::new(bytes), 0, len)
-    }
-
-    pub fn from_shared_bytes(
-        bytes: Arc<Vec<u8>>,
-        offset: usize,
-        len: usize,
-    ) -> Result<Index, Box<Error>>
-    {
-        let data = SharedData::new(bytes, offset, len);
-        let mut cursor = Cursor::new(data);
-
-        let negative = Negative::from_cursor(&mut cursor)?;
-        let positive = Positive::from_cursor(&mut cursor)?;
-        Ok(Index { negative, positive })
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        self.negative.write_to_bytes(bytes);
-        self.positive.write_to_bytes(bytes);
-    }
-
-    pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
-        if other.negative.is_empty() {
-            let negative = Negative::default();
-            let positive = self.positive.union(&other.positive)?;
-            return Ok(Index { negative, positive })
-        }
-
-        let mut buffer = Vec::new();
-        let mut builder = PositiveBuilder::memory();
-        let mut stream = self.positive.into_stream();
-        while let Some((key, indexes)) = stream.next() {
-            let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
-
-            buffer.clear();
-            op.extend_vec(&mut buffer);
-
-            if !buffer.is_empty() {
-                let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes)?;
-            }
-        }
-
-        let positive = {
-            let (map, indexes) = builder.into_inner()?;
-            let map = Map::from_bytes(map)?;
-            let indexes = DocIndexes::from_bytes(indexes)?;
-            Positive::new(map, indexes)
-        };
-
-        let negative = Negative::default();
-        let positive = positive.union(&other.positive)?;
-        Ok(Index { negative, positive })
-    }
-}
@@ -1,43 +0,0 @@
-use std::error::Error;
-use std::io::Cursor;
-use std::ops::Deref;
-
-use sdset::Set;
-use byteorder::{LittleEndian, WriteBytesExt};
-
-use crate::data::SharedData;
-use crate::data::DocIds;
-use crate::DocumentId;
-
-#[derive(Default)]
-pub struct Negative(DocIds);
-
-impl Negative {
-    pub fn new(doc_ids: DocIds) -> Negative {
-        Negative(doc_ids)
-    }
-
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
-        let doc_ids = DocIds::from_cursor(cursor)?;
-        Ok(Negative(doc_ids))
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.0.as_bytes();
-        let len = slice.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(len);
-        bytes.extend_from_slice(slice);
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-}
-
-impl Deref for Negative {
-    type Target = Set<DocumentId>;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.as_ref()
-    }
-}
@ -1,27 +1,48 @@
|
|||||||
use std::sync::{Arc, Mutex};
|
use std::time::Instant;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::ops::Deref;
|
use std::ffi::OsStr;
|
||||||
use std::path::Path;
|
use std::sync::Arc;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::ops::{Deref, DerefMut};
|
||||||
|
|
||||||
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
|
||||||
use rocksdb::rocksdb::{Writable, Snapshot};
|
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||||
use rocksdb::{DB, DBVector, MergeOperands};
|
use rocksdb::{DB, MergeOperands};
|
||||||
use crossbeam::atomic::ArcCell;
|
use size_format::SizeFormatterBinary;
|
||||||
use log::debug;
|
use arc_swap::ArcSwap;
|
||||||
|
use lockfree::map::Map;
|
||||||
|
use hashbrown::HashMap;
|
||||||
|
use log::{info, error, warn};
|
||||||
|
|
||||||
|
use crate::database::schema::SchemaAttr;
|
||||||
|
use crate::shared_data_cursor::FromSharedDataCursor;
|
||||||
|
use crate::write_to_bytes::WriteToBytes;
|
||||||
|
use crate::DocumentId;
|
||||||
|
|
||||||
|
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
|
||||||
|
|
||||||
|
pub use self::config::Config;
|
||||||
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
pub use self::view::{DatabaseView, DocumentIter};
|
pub use self::view::{DatabaseView, DocumentIter};
|
||||||
pub use self::update::{Update, UpdateBuilder};
|
pub use self::update::Update;
|
||||||
pub use self::serde::SerializerError;
|
pub use self::serde::SerializerError;
|
||||||
pub use self::schema::Schema;
|
pub use self::schema::Schema;
|
||||||
pub use self::index::Index;
|
pub use self::index::Index;
|
||||||
|
pub use self::number::{Number, ParseNumberError};
|
||||||
|
|
||||||
|
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
|
||||||
|
|
||||||
const DATA_INDEX: &[u8] = b"data-index";
|
const DATA_INDEX: &[u8] = b"data-index";
|
||||||
|
const DATA_RANKED_MAP: &[u8] = b"data-ranked-map";
|
||||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||||
|
const CONFIG: &[u8] = b"config";
|
||||||
|
|
||||||
|
pub mod config;
|
||||||
pub mod schema;
|
pub mod schema;
|
||||||
pub(crate) mod index;
|
pub(crate) mod index;
|
||||||
mod deserializer;
|
mod number;
|
||||||
mod document_key;
|
mod document_key;
|
||||||
mod serde;
|
mod serde;
|
||||||
mod update;
|
mod update;
|
||||||
@ -39,64 +60,150 @@ where D: Deref<Target=DB>
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
let index = match snapshot.get(DATA_INDEX)? {
let start = Instant::now();
let vector = snapshot.get(DATA_INDEX)?;
info!("loading index from kv-store took {:.2?}", start.elapsed());

match vector {
Some(vector) => {
let start = Instant::now();

let bytes = vector.as_ref().to_vec();
Index::from_bytes(bytes)?
info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
},
None => Index::default(),
let event = ReadIndexEvent::from_bytes(bytes)?;
};
let index = event.updated_documents().expect("BUG: invalid event deserialized");

info!("loading index from bytes took {:.2?}", start.elapsed());

Ok(index)
},
None => Ok(Index::default()),
}
}

fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
fn retrieve_data_ranked_map<D>(snapshot: &Snapshot<D>) -> Result<RankedMap, Box<Error>>
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
where D: Deref<Target=DB>,
{
let start = Instant::now();
let vector = snapshot.get(DATA_RANKED_MAP)?;
info!("loading ranked map from kv-store took {:.2?}", start.elapsed());

let mut index: Option<Index> = None;
match vector {
Some(vector) => {
let start = Instant::now();

let bytes = vector.as_ref().to_vec();
info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64));

let event = ReadRankedMapEvent::from_bytes(bytes)?;
let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized");

info!("loading ranked map from bytes took {:.2?}", start.elapsed());

Ok(ranked_map)
},
None => Ok(RankedMap::new()),
}
}

fn retrieve_config<D>(snapshot: &Snapshot<D>) -> Result<Config, Box<Error>>
where D: Deref<Target=DB>,
{
match snapshot.get(CONFIG)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(Config::default()),
}
}

fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
use self::update::ReadIndexEvent::{self, *};
use self::update::WriteIndexEvent;

let mut index = Index::default();
for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() {
let merged = match index {
RemovedDocuments(d) => index = index.remove_documents(d.as_ref()),
Some(ref index) => index.merge(&operand).unwrap(),
UpdatedDocuments(i) => index = index.union(&i),
None => operand,
}
};

index.replace(merged);
}

let index = index.unwrap_or_default();
WriteIndexEvent::UpdatedDocuments(&index).into_bytes()
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
bytes
}
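The rewritten merge_indexes above no longer merges serialized indexes directly: each RocksDB merge operand deserializes to a ReadIndexEvent (documents removed or updated), the events are folded onto an accumulator Index, and the result is re-serialized as a WriteIndexEvent. A self-contained sketch of that fold, using toy stand-in types since Index and the event types live elsewhere in the crate:

// ToyIndex and ToyEvent are illustrative stand-ins for Index and ReadIndexEvent;
// only the shape of the fold mirrors merge_indexes above.
#[derive(Default, Debug, PartialEq)]
struct ToyIndex(Vec<u64>);

enum ToyEvent {
    RemovedDocuments(Vec<u64>),
    UpdatedDocuments(Vec<u64>),
}

fn merge(existing: Option<ToyEvent>, operands: Vec<ToyEvent>) -> ToyIndex {
    let mut index = ToyIndex::default();
    // Fold the existing value first, then every merge operand, in order.
    for event in existing.into_iter().chain(operands) {
        match event {
            ToyEvent::RemovedDocuments(removed) => index.0.retain(|id| !removed.contains(id)),
            ToyEvent::UpdatedDocuments(added) => index.0.extend(added),
        }
    }
    index
}

fn main() {
    let merged = merge(
        Some(ToyEvent::UpdatedDocuments(vec![1, 2, 3])),
        vec![ToyEvent::RemovedDocuments(vec![2])],
    );
    assert_eq!(merged, ToyIndex(vec![1, 3]));
}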

pub struct Database {
fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
// DB is under a Mutex to sync update ingestions and separate DB update locking
use self::update::ReadRankedMapEvent::{self, *};
// and DatabaseView acquiring locking in other words:
use self::update::WriteRankedMapEvent;
// "Block readers the minimum possible amount of time"
db: Mutex<Arc<DB>>,

// This view is updated each time the DB ingests an update
let mut ranked_map = RankedMap::default();
view: ArcCell<DatabaseView<Arc<DB>>>,
for bytes in existing.into_iter().chain(operands) {
match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() {
RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()),
UpdatedDocuments(i) => ranked_map.extend(i),
}
}

impl Database {
WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes()
pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
}

fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
match key {
DATA_INDEX => merge_indexes(existing, operands),
DATA_RANKED_MAP => merge_ranked_maps(existing, operands),
key => panic!("The merge operator does not support merging {:?}", key),
}
}

pub struct IndexUpdate {
index: String,
update: Update,
}

impl Deref for IndexUpdate {
type Target = Update;

fn deref(&self) -> &Update {
&self.update
}
}

impl DerefMut for IndexUpdate {
fn deref_mut(&mut self) -> &mut Update {
&mut self.update
}
}

struct DatabaseIndex {
db: Arc<DB>,

// This view is updated each time the DB ingests an update.
view: ArcSwap<DatabaseView<Arc<DB>>>,

// The path of the mdb folder stored on disk.
path: PathBuf,

// must_die false by default, must be set as true when the Index is dropped.
// It is used to erase the folder saved on disk when the user request to delete an index.
must_die: AtomicBool,
}

impl DatabaseIndex {
fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}

let path = path.to_string_lossy();
let path_lossy = path.to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME pull request that

let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
cf_opts.add_merge_operator("data merge operator", merge_operator);

let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;

let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
@ -104,21 +211,26 @@ impl Database {

let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));

Ok(Database { db: Mutex::new(db), view })
Ok(DatabaseIndex {
db: db,
view: view,
path: path.to_path_buf(),
must_die: AtomicBool::new(false)
})
}

pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref().to_string_lossy();
let path_lossy = path.as_ref().to_string_lossy();

let mut opts = DBOptions::new();
opts.create_if_missing(false);

let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
cf_opts.add_merge_operator("data merge operator", merge_operator);

let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;

// FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? {
|
|||||||
|
|
||||||
let db = Arc::new(db);
|
let db = Arc::new(db);
|
||||||
let snapshot = Snapshot::new(db.clone());
|
let snapshot = Snapshot::new(db.clone());
|
||||||
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
|
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));
|
||||||
|
|
||||||
Ok(Database { db: Mutex::new(db), view })
|
Ok(DatabaseIndex {
|
||||||
|
db: db,
|
||||||
|
view: view,
|
||||||
|
path: path.as_ref().to_path_buf(),
|
||||||
|
must_die: AtomicBool::new(false)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
fn must_die(&self) {
|
||||||
let snapshot = {
|
self.must_die.store(true, Ordering::Relaxed)
|
||||||
// We must have a mutex here to ensure that update ingestions and compactions
|
}
|
||||||
// are done atomatically and in the right order.
|
|
||||||
// This way update ingestions will block other update ingestions without blocking view
|
fn start_update(&self) -> Result<Update, Box<Error>> {
|
||||||
// creations while doing the "data-index" compaction
|
let schema = match self.db.get(DATA_SCHEMA)? {
|
||||||
let db = match self.db.lock() {
|
Some(value) => Schema::read_from_bin(&*value)?,
|
||||||
Ok(db) => db,
|
None => panic!("Database does not contain a schema"),
|
||||||
Err(e) => return Err(e.to_string().into()),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let path = update.path().to_string_lossy();
|
Ok(Update::new(schema))
|
||||||
let options = IngestExternalFileOptions::new();
|
}
|
||||||
// options.move_files(move_update);
|
|
||||||
|
|
||||||
debug!("ingest update file");
|
fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
let batch = update.build()?;
|
||||||
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
|
self.db.write(batch)?;
|
||||||
|
|
||||||
debug!("compacting index range");
|
|
||||||
// Compacting to trigger the merge operator only one time
|
|
||||||
// while ingesting the update and not each time searching
|
|
||||||
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
|
|
||||||
|
|
||||||
Snapshot::new(db.clone())
|
|
||||||
};
|
|
||||||
|
|
||||||
|
let snapshot = Snapshot::new(self.db.clone());
|
||||||
let view = Arc::new(DatabaseView::new(snapshot)?);
|
let view = Arc::new(DatabaseView::new(snapshot)?);
|
||||||
self.view.set(view.clone());
|
self.view.store(view.clone());
|
||||||
|
|
||||||
Ok(view)
|
Ok(view)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
|
fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
|
||||||
self.view().get(key)
|
self.view.load()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn flush(&self) -> Result<(), Box<Error>> {
|
fn get_config(&self) -> Config {
|
||||||
match self.db.lock() {
|
self.view().config().clone()
|
||||||
Ok(db) => Ok(db.flush(true)?),
|
}
|
||||||
Err(e) => Err(e.to_string().into()),
|
|
||||||
|
fn update_config(&self, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
|
||||||
|
let data = bincode::serialize(&config)?;
|
||||||
|
self.db.put(CONFIG, &data)?;
|
||||||
|
|
||||||
|
let snapshot = Snapshot::new(self.db.clone());
|
||||||
|
let view = Arc::new(DatabaseView::new(snapshot)?);
|
||||||
|
self.view.store(view.clone());
|
||||||
|
|
||||||
|
Ok(view)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn path(&self) -> &Path {
|
||||||
|
self.path.as_path()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
|
impl Drop for DatabaseIndex {
|
||||||
self.view.get()
|
fn drop(&mut self) {
|
||||||
|
if self.must_die.load(Ordering::Relaxed) {
|
||||||
|
if let Err(err) = fs::remove_dir_all(&self.path) {
|
||||||
|
error!("Impossible to remove mdb when Database is dropped; {}", err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Database {
|
||||||
|
indexes: Map<String, Arc<DatabaseIndex>>,
|
||||||
|
path: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Database {
|
||||||
|
pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
|
||||||
|
Ok(Database {
|
||||||
|
indexes: Map::new(),
|
||||||
|
path: path.as_ref().to_path_buf(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
|
||||||
|
let entries = fs::read_dir(&path)?;
|
||||||
|
|
||||||
|
let indexes = Map::new();
|
||||||
|
for entry in entries {
|
||||||
|
let path = match entry {
|
||||||
|
Ok(p) => p.path(),
|
||||||
|
Err(err) => {
|
||||||
|
warn!("Impossible to retrieve the path from an entry; {}", err);
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let name = match path.file_stem().and_then(OsStr::to_str) {
|
||||||
|
Some(name) => name.to_owned(),
|
||||||
|
None => continue
|
||||||
|
};
|
||||||
|
|
||||||
|
let db = match DatabaseIndex::open(path.clone()) {
|
||||||
|
Ok(db) => db,
|
||||||
|
Err(err) => {
|
||||||
|
warn!("Impossible to open the database; {}", err);
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("Load database {}", name);
|
||||||
|
indexes.insert(name, Arc::new(db));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Database {
|
||||||
|
indexes: indexes,
|
||||||
|
path: path.as_ref().to_path_buf(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
|
||||||
|
let index_path = self.path.join(name);
|
||||||
|
|
||||||
|
if index_path.exists() {
|
||||||
|
return Err("Index already exists".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let index = DatabaseIndex::create(index_path, schema)?;
|
||||||
|
self.indexes.insert(name.to_owned(), Arc::new(index));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
|
||||||
|
let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
|
||||||
|
index_guard.val().must_die();
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn list_indexes(&self) -> Vec<String> {
|
||||||
|
self.indexes.iter().map(|g| g.key().clone()).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
|
||||||
|
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||||
|
let update = index_guard.val().start_update()?;
|
||||||
|
|
||||||
|
Ok(IndexUpdate { index: index.to_owned(), update })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn commit_update(&self, update: IndexUpdate)-> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||||
|
let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
|
||||||
|
|
||||||
|
index_guard.val().commit_update(update.update)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||||
|
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||||
|
|
||||||
|
Ok(index_guard.val().view())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_config(&self, index: &str) -> Result<Config, Box<Error>> {
|
||||||
|
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||||
|
|
||||||
|
Ok(index_guard.val().get_config())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn update_config(&self, index: &str, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
|
||||||
|
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||||
|
|
||||||
|
Ok(index_guard.val().update_config(config)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn path(&self) -> &Path {
|
||||||
|
self.path.as_path()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn index_path(&self, index: &str) -> Result<PathBuf, Box<Error>> {
|
||||||
|
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||||
|
let path = index_guard.val().path();
|
||||||
|
Ok(path.to_path_buf())
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use std::collections::HashSet;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
use hashbrown::HashSet;
|
|
||||||
use tempfile::tempdir;
|
|
||||||
|
|
||||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||||
use crate::database::update::UpdateBuilder;
|
|
||||||
use crate::tokenizer::DefaultBuilder;
|
use crate::tokenizer::DefaultBuilder;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ingest_one_update_file() -> Result<(), Box<Error>> {
|
fn ingest_one_easy_update() -> Result<(), Box<Error>> {
|
||||||
let dir = tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
let stop_words = HashSet::new();
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
let meilidb_path = dir.path().join("meilidb.mdb");
|
||||||
|
let meilidb_index_name = "default";
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||||
struct SimpleDoc {
|
struct SimpleDoc {
|
||||||
@ -219,9 +461,9 @@ mod tests {
|
|||||||
builder.build()
|
builder.build()
|
||||||
};
|
};
|
||||||
|
|
||||||
let database = Database::create(&rocksdb_path, &schema)?;
|
let database = Database::create(&meilidb_path)?;
|
||||||
|
|
||||||
let update_path = dir.path().join("update.sst");
|
database.create_index(meilidb_index_name, &schema)?;
|
||||||
|
|
||||||
let doc0 = SimpleDoc {
|
let doc0 = SimpleDoc {
|
||||||
id: 0,
|
id: 0,
|
||||||
@ -236,20 +478,13 @@ mod tests {
|
|||||||
timestamp: 7654321,
|
timestamp: 7654321,
|
||||||
};
|
};
|
||||||
|
|
||||||
let docid0;
|
|
||||||
let docid1;
|
|
||||||
let update = {
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
let tokenizer_builder = DefaultBuilder::new();
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
|
|
||||||
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||||
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||||
|
|
||||||
builder.build()?
|
let view = database.commit_update(builder)?;
|
||||||
};
|
|
||||||
|
|
||||||
database.ingest_update_file(update)?;
|
|
||||||
let view = database.view();
|
|
||||||
|
|
||||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||||
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
||||||
@ -261,11 +496,12 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ingest_two_update_files() -> Result<(), Box<Error>> {
|
fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
|
||||||
let dir = tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
let stop_words = HashSet::new();
|
let stop_words = HashSet::new();
|
||||||
|
|
||||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
let meilidb_path = dir.path().join("meilidb.mdb");
|
||||||
|
let meilidb_index_name = "default";
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||||
struct SimpleDoc {
|
struct SimpleDoc {
|
||||||
@ -284,7 +520,9 @@ mod tests {
|
|||||||
builder.build()
|
builder.build()
|
||||||
};
|
};
|
||||||
|
|
||||||
let database = Database::create(&rocksdb_path, &schema)?;
|
let database = Database::create(&meilidb_path)?;
|
||||||
|
|
||||||
|
database.create_index(meilidb_index_name, &schema)?;
|
||||||
|
|
||||||
let doc0 = SimpleDoc {
|
let doc0 = SimpleDoc {
|
||||||
id: 0,
|
id: 0,
|
||||||
@ -311,36 +549,17 @@ mod tests {
|
|||||||
timestamp: 7654321,
|
timestamp: 7654321,
|
||||||
};
|
};
|
||||||
|
|
||||||
let docid0;
|
|
||||||
let docid1;
|
|
||||||
let update1 = {
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
let tokenizer_builder = DefaultBuilder::new();
|
||||||
let update_path = dir.path().join("update-000.sst");
|
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema.clone());
|
|
||||||
|
|
||||||
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
|
||||||
|
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
|
||||||
|
database.commit_update(builder)?;
|
||||||
|
|
||||||
builder.build()?
|
let mut builder = database.start_update(meilidb_index_name)?;
|
||||||
};
|
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
||||||
|
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
||||||
let docid2;
|
let view = database.commit_update(builder)?;
|
||||||
let docid3;
|
|
||||||
let update2 = {
|
|
||||||
let tokenizer_builder = DefaultBuilder::new();
|
|
||||||
let update_path = dir.path().join("update-001.sst");
|
|
||||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
|
||||||
|
|
||||||
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
|
|
||||||
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
|
|
||||||
|
|
||||||
builder.build()?
|
|
||||||
};
|
|
||||||
|
|
||||||
database.ingest_update_file(update1)?;
|
|
||||||
database.ingest_update_file(update2)?;
|
|
||||||
|
|
||||||
let view = database.view();
|
|
||||||
|
|
||||||
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
|
||||||
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
|
||||||
@ -362,7 +581,7 @@ mod tests {
|
|||||||
mod bench {
|
mod bench {
|
||||||
extern crate test;
|
extern crate test;
|
||||||
|
|
||||||
use super::*;
|
use std::collections::HashSet;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::iter::repeat_with;
|
use std::iter::repeat_with;
|
||||||
use self::test::Bencher;
|
use self::test::Bencher;
|
||||||
@ -372,12 +591,12 @@ mod bench {
|
|||||||
use rand::{Rng, SeedableRng};
|
use rand::{Rng, SeedableRng};
|
||||||
use serde_derive::Serialize;
|
use serde_derive::Serialize;
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use hashbrown::HashSet;
|
|
||||||
|
|
||||||
use crate::tokenizer::DefaultBuilder;
|
use crate::tokenizer::DefaultBuilder;
|
||||||
use crate::database::update::UpdateBuilder;
|
|
||||||
use crate::database::schema::*;
|
use crate::database::schema::*;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
|
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
|
||||||
let mut words = String::new();
|
let mut words = String::new();
|
||||||
|
|
||||||
@ -409,7 +628,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -418,9 +640,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..300 {
|
for i in 0..300 {
|
||||||
@ -432,8 +653,7 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
database.commit_update(builder)?;
|
||||||
database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
drop(database);
|
drop(database);
|
||||||
|
|
||||||
@ -456,7 +676,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -465,9 +688,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..3000 {
|
for i in 0..3000 {
|
||||||
@ -479,8 +701,7 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
database.commit_update(builder)?;
|
||||||
database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
drop(database);
|
drop(database);
|
||||||
|
|
||||||
@ -504,7 +725,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -513,9 +737,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..30_000 {
|
for i in 0..30_000 {
|
||||||
@ -527,8 +750,7 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
database.commit_update(builder)?;
|
||||||
database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
drop(database);
|
drop(database);
|
||||||
|
|
||||||
@ -551,7 +773,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -560,9 +785,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..300 {
|
for i in 0..300 {
|
||||||
@ -574,12 +798,11 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let view = database.commit_update(builder)?;
|
||||||
let view = database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
for q in &["a", "b", "c", "d", "e"] {
|
for q in &["a", "b", "c", "d", "e"] {
|
||||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
let documents = view.query_builder().query(q, 0..20);
|
||||||
test::black_box(|| documents);
|
test::black_box(|| documents);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -598,7 +821,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -607,9 +833,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..3000 {
|
for i in 0..3000 {
|
||||||
@ -621,12 +846,11 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let view = database.commit_update(builder)?;
|
||||||
let view = database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
for q in &["a", "b", "c", "d", "e"] {
|
for q in &["a", "b", "c", "d", "e"] {
|
||||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
let documents = view.query_builder().query(q, 0..20);
|
||||||
test::black_box(|| documents);
|
test::black_box(|| documents);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -646,7 +870,10 @@ mod bench {
|
|||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
|
|
||||||
let db_path = dir.path().join("bench.mdb");
|
let db_path = dir.path().join("bench.mdb");
|
||||||
let database = Database::create(db_path.clone(), &schema)?;
|
let index_name = "default";
|
||||||
|
|
||||||
|
let database = Database::create(&db_path)?;
|
||||||
|
database.create_index(index_name, &schema)?;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct Document {
|
struct Document {
|
||||||
@ -655,9 +882,8 @@ mod bench {
|
|||||||
description: String,
|
description: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
let path = dir.path().join("update-000.sst");
|
|
||||||
let tokenizer_builder = DefaultBuilder;
|
let tokenizer_builder = DefaultBuilder;
|
||||||
let mut builder = UpdateBuilder::new(path, schema);
|
let mut builder = database.start_update(index_name)?;
|
||||||
let mut rng = XorShiftRng::seed_from_u64(42);
|
let mut rng = XorShiftRng::seed_from_u64(42);
|
||||||
|
|
||||||
for i in 0..30_000 {
|
for i in 0..30_000 {
|
||||||
@ -669,12 +895,11 @@ mod bench {
|
|||||||
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = builder.build()?;
|
let view = database.commit_update(builder)?;
|
||||||
let view = database.ingest_update_file(update)?;
|
|
||||||
|
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
for q in &["a", "b", "c", "d", "e"] {
|
for q in &["a", "b", "c", "d", "e"] {
|
||||||
let documents = view.query_builder().unwrap().query(q, 0..20);
|
let documents = view.query_builder().query(q, 0..20);
|
||||||
test::black_box(|| documents);
|
test::black_box(|| documents);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
98
src/database/number.rs
Normal file
@ -0,0 +1,98 @@
use std::cmp::Ordering;
use std::str::FromStr;
use std::fmt;

use serde_derive::{Serialize, Deserialize};

#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone)]
pub enum Number {
    Unsigned(u64),
    Signed(i64),
    Float(f64),
}

impl FromStr for Number {
    type Err = ParseNumberError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if let Ok(unsigned) = u64::from_str(s) {
            return Ok(Number::Unsigned(unsigned))
        }

        if let Ok(signed) = i64::from_str(s) {
            return Ok(Number::Signed(signed))
        }

        if let Ok(float) = f64::from_str(s) {
            if float == 0.0 || float.is_normal() {
                return Ok(Number::Float(float))
            }
        }

        Err(ParseNumberError)
    }
}

impl PartialOrd for Number {
    fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Number {
    fn cmp(&self, other: &Number) -> Ordering {
        use Number::*;
        match (self, other) {
            (Unsigned(s), Unsigned(o)) => s.cmp(o),
            (Unsigned(s), Signed(o)) => {
                let s = i128::from(*s);
                let o = i128::from(*o);
                s.cmp(&o)
            },
            (Unsigned(s), Float(o)) => {
                let s = *s as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },

            (Signed(s), Unsigned(o)) => {
                let s = i128::from(*s);
                let o = i128::from(*o);
                s.cmp(&o)
            },
            (Signed(s), Signed(o)) => s.cmp(o),
            (Signed(s), Float(o)) => {
                let s = *s as f64;
                s.partial_cmp(o).unwrap_or(Ordering::Equal)
            },

            (Float(s), Unsigned(o)) => {
                let o = *o as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },
            (Float(s), Signed(o)) => {
                let o = *o as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },
            (Float(s), Float(o)) => {
                s.partial_cmp(o).unwrap_or(Ordering::Equal)
            },
        }
    }
}

impl PartialEq for Number {
    fn eq(&self, other: &Number) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for Number { }

pub struct ParseNumberError;

impl fmt::Display for ParseNumberError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("can not parse number")
    }
}
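The Ord implementation above makes comparisons between the Unsigned and Signed variants total by widening both operands to i128, and falls back to partial_cmp with Ordering::Equal when a float is not comparable. A standalone sketch of the widening trick (cmp_unsigned_signed is an illustrative helper, not an API of this crate):

use std::cmp::Ordering;

// Compare a u64 against an i64 without overflow by widening both to i128,
// the same approach used by Ord for Number above.
fn cmp_unsigned_signed(u: u64, s: i64) -> Ordering {
    i128::from(u).cmp(&i128::from(s))
}

fn main() {
    assert_eq!(cmp_unsigned_signed(1, -1), Ordering::Greater);
    assert_eq!(cmp_unsigned_signed(0, 0), Ordering::Equal);
    assert_eq!(cmp_unsigned_signed(u64::max_value(), i64::max_value()), Ordering::Greater);
}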
@ -7,14 +7,14 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
use linked_hash_map::LinkedHashMap;
|
use linked_hash_map::LinkedHashMap;
|
||||||
use serde::Serialize;
|
|
||||||
|
|
||||||
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::DocumentId;
|
use crate::DocumentId;
|
||||||
|
|
||||||
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
|
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
|
||||||
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
|
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
|
||||||
|
pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct SchemaProps {
|
pub struct SchemaProps {
|
||||||
@ -23,6 +23,9 @@ pub struct SchemaProps {
|
|||||||
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
indexed: bool,
|
indexed: bool,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
ranked: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SchemaProps {
|
impl SchemaProps {
|
||||||
@ -33,6 +36,10 @@ impl SchemaProps {
|
|||||||
pub fn is_indexed(self) -> bool {
|
pub fn is_indexed(self) -> bool {
|
||||||
self.indexed
|
self.indexed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_ranked(self) -> bool {
|
||||||
|
self.ranked
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BitOr for SchemaProps {
|
impl BitOr for SchemaProps {
|
||||||
@ -42,6 +49,7 @@ impl BitOr for SchemaProps {
|
|||||||
SchemaProps {
|
SchemaProps {
|
||||||
stored: self.stored | other.stored,
|
stored: self.stored | other.stored,
|
||||||
indexed: self.indexed | other.indexed,
|
indexed: self.indexed | other.indexed,
|
||||||
|
ranked: self.ranked | other.ranked,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -113,6 +121,23 @@ impl Schema {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
reader.read_to_end(&mut buffer)?;
|
||||||
|
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
|
||||||
|
Ok(builder.build())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
|
||||||
|
let identifier = self.inner.identifier.clone();
|
||||||
|
let attributes = self.attributes_ordered();
|
||||||
|
let builder = SchemaBuilder { identifier, attributes };
|
||||||
|
let string = serde_json::to_string_pretty(&builder)?;
|
||||||
|
writer.write_all(string.as_bytes())?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
|
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
|
||||||
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
|
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
|
||||||
Ok(builder.build())
|
Ok(builder.build())
|
||||||
@ -142,7 +167,7 @@ impl Schema {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
|
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
|
||||||
where T: Serialize,
|
where T: serde::Serialize,
|
||||||
{
|
{
|
||||||
let id_attribute_name = &self.inner.identifier;
|
let id_attribute_name = &self.inner.identifier;
|
||||||
let serializer = FindDocumentIdSerializer { id_attribute_name };
|
let serializer = FindDocumentIdSerializer { id_attribute_name };
|
||||||
@ -168,7 +193,8 @@ impl Schema {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
|
#[derive(Serialize, Deserialize)]
|
||||||
|
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||||
pub struct SchemaAttr(pub(crate) u16);
|
pub struct SchemaAttr(pub(crate) u16);
|
||||||
|
|
||||||
impl SchemaAttr {
|
impl SchemaAttr {
|
||||||
@ -254,4 +280,40 @@ mod tests {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
|
||||||
|
let mut builder = SchemaBuilder::with_identifier("id");
|
||||||
|
builder.new_attribute("alpha", STORED);
|
||||||
|
builder.new_attribute("beta", STORED | INDEXED);
|
||||||
|
builder.new_attribute("gamma", INDEXED);
|
||||||
|
let schema = builder.build();
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
schema.to_json(&mut buffer)?;
|
||||||
|
|
||||||
|
let schema2 = Schema::from_json(buffer.as_slice())?;
|
||||||
|
assert_eq!(schema, schema2);
|
||||||
|
|
||||||
|
let data = r#"
|
||||||
|
{
|
||||||
|
"identifier": "id",
|
||||||
|
"attributes": {
|
||||||
|
"alpha": {
|
||||||
|
"stored": true
|
||||||
|
},
|
||||||
|
"beta": {
|
||||||
|
"stored": true,
|
||||||
|
"indexed": true
|
||||||
|
},
|
||||||
|
"gamma": {
|
||||||
|
"indexed": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}"#;
|
||||||
|
let schema2 = Schema::from_json(data.as_bytes())?;
|
||||||
|
assert_eq!(schema, schema2);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,23 +1,24 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use serde::Serialize;
|
||||||
|
use serde::ser;
|
||||||
|
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::tokenizer::Token;
|
use crate::tokenizer::Token;
|
||||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
use crate::{is_cjk, DocumentId, DocIndex};
|
||||||
|
|
||||||
use hashbrown::HashSet;
|
pub struct IndexerSerializer<'a, 'b, B> {
|
||||||
use serde::Serialize;
|
|
||||||
use serde::ser;
|
|
||||||
|
|
||||||
pub struct IndexerSerializer<'a, B> {
|
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub attribute: SchemaAttr,
|
pub attribute: SchemaAttr,
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
|
||||||
where B: TokenizerBuilder
|
where B: TokenizerBuilder
|
||||||
{
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
@ -54,10 +55,8 @@ where B: TokenizerBuilder
|
|||||||
let document_id = self.document_id;
|
let document_id = self.document_id;
|
||||||
|
|
||||||
// FIXME must u32::try_from instead
|
// FIXME must u32::try_from instead
|
||||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
let attribute = self.attribute.0;
|
||||||
Ok(attribute) => attribute,
|
let word_index = word_index as u32;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
// insert the exact representation
|
// insert the exact representation
|
||||||
let word_lower = word.to_lowercase();
|
let word_lower = word.to_lowercase();
|
||||||
@ -66,24 +65,23 @@ where B: TokenizerBuilder
|
|||||||
if self.stop_words.contains(&word_lower) { continue }
|
if self.stop_words.contains(&word_lower) { continue }
|
||||||
|
|
||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
|
if !word_lower.chars().any(is_cjk) {
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
|
let word_unidecoded = word_unidecoded.trim();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
let word_area = match WordArea::new(char_index as u32, length) {
|
let char_index = char_index as u32;
|
||||||
Ok(word_area) => word_area,
|
let char_length = length;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let word_area = match WordArea::new(char_index as u32, length) {
|
let char_index = char_index as u32;
|
||||||
Ok(word_area) => word_area,
|
let char_length = length;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -17,8 +17,10 @@ macro_rules! forward_to_unserializable_type {

pub mod find_id;
pub mod key_to_string;
pub mod value_to_number;
pub mod serializer;
pub mod indexer_serializer;
pub mod deserializer;

pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
@ -55,3 +57,9 @@ impl fmt::Display for SerializerError {
}

impl Error for SerializerError {}

impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
@ -1,24 +1,26 @@
|
|||||||
use hashbrown::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
|
||||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||||
|
use crate::database::serde::value_to_number::ValueToNumberSerializer;
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::DocumentId;
|
use crate::DocumentId;
|
||||||
|
|
||||||
pub struct Serializer<'a, B> {
|
pub struct Serializer<'a, 'b, B> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
|
||||||
where B: TokenizerBuilder
|
where B: TokenizerBuilder
|
||||||
{
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
@ -27,8 +29,8 @@ where B: TokenizerBuilder
|
|||||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeMap = MapSerializer<'a, B>;
|
type SerializeMap = MapSerializer<'a, 'b, B>;
|
||||||
type SerializeStruct = StructSerializer<'a, B>;
|
type SerializeStruct = StructSerializer<'a, 'b, B>;
|
||||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
|
||||||
forward_to_unserializable_type! {
|
forward_to_unserializable_type! {
|
||||||
@ -154,8 +156,8 @@ where B: TokenizerBuilder
|
|||||||
{
|
{
|
||||||
Ok(StructSerializer {
|
Ok(StructSerializer {
|
||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
update: self.update,
|
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
|
update: self.update,
|
||||||
tokenizer_builder: self.tokenizer_builder,
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
stop_words: self.stop_words,
|
stop_words: self.stop_words,
|
||||||
})
|
})
|
||||||
@ -173,16 +175,16 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct MapSerializer<'a, B> {
|
pub struct MapSerializer<'a, 'b, B> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
pub current_key_name: Option<String>,
|
pub current_key_name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
|
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
|
||||||
where B: TokenizerBuilder
|
where B: TokenizerBuilder
|
||||||
{
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
@ -206,7 +208,7 @@ where B: TokenizerBuilder
|
|||||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||||
&mut self,
|
&mut self,
|
||||||
key: &K,
|
key: &K,
|
||||||
value: &V
|
value: &V,
|
||||||
) -> Result<(), Self::Error>
|
) -> Result<(), Self::Error>
|
||||||
where K: Serialize, V: Serialize,
|
where K: Serialize, V: Serialize,
|
||||||
{
|
{
|
||||||
@ -216,7 +218,7 @@ where B: TokenizerBuilder
|
|||||||
let props = self.schema.props(attr);
|
let props = self.schema.props(attr);
|
||||||
if props.is_stored() {
|
if props.is_stored() {
|
||||||
let value = bincode::serialize(value).unwrap();
|
let value = bincode::serialize(value).unwrap();
|
||||||
self.update.insert_attribute_value(attr, value);
|
self.update.insert_attribute_value(attr, &value)?;
|
||||||
}
|
}
|
||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let serializer = IndexerSerializer {
|
let serializer = IndexerSerializer {
|
||||||
@ -228,6 +230,10 @@ where B: TokenizerBuilder
|
|||||||
};
|
};
|
||||||
value.serialize(serializer)?;
|
value.serialize(serializer)?;
|
||||||
}
|
}
|
||||||
|
if props.is_ranked() {
|
||||||
|
let number = value.serialize(ValueToNumberSerializer)?;
|
||||||
|
self.update.register_ranked_attribute(attr, number)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -238,15 +244,15 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StructSerializer<'a, B> {
|
pub struct StructSerializer<'a, 'b, B> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub update: &'a mut DocumentUpdate,
|
pub update: &'a mut DocumentUpdate<'b>,
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
pub stop_words: &'a HashSet<String>,
|
pub stop_words: &'a HashSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
|
||||||
where B: TokenizerBuilder
|
where B: TokenizerBuilder
|
||||||
{
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
@ -263,7 +269,7 @@ where B: TokenizerBuilder
|
|||||||
let props = self.schema.props(attr);
|
let props = self.schema.props(attr);
|
||||||
if props.is_stored() {
|
if props.is_stored() {
|
||||||
let value = bincode::serialize(value).unwrap();
|
let value = bincode::serialize(value).unwrap();
|
||||||
self.update.insert_attribute_value(attr, value);
|
self.update.insert_attribute_value(attr, &value)?;
|
||||||
}
|
}
|
||||||
if props.is_indexed() {
|
if props.is_indexed() {
|
||||||
let serializer = IndexerSerializer {
|
let serializer = IndexerSerializer {
|
||||||
@ -275,6 +281,10 @@ where B: TokenizerBuilder
|
|||||||
};
|
};
|
||||||
value.serialize(serializer)?;
|
value.serialize(serializer)?;
|
||||||
}
|
}
|
||||||
|
if props.is_ranked() {
|
||||||
|
let integer = value.serialize(ValueToNumberSerializer)?;
|
||||||
|
self.update.register_ranked_attribute(attr, integer)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
src/database/serde/value_to_number.rs (new file, +176 lines)
@@ -0,0 +1,176 @@
+use std::str::FromStr;
+
+use serde::Serialize;
+use serde::{ser, ser::Error};
+
+use crate::database::serde::SerializerError;
+use crate::database::Number;
+
+pub struct ValueToNumberSerializer;
+
+impl ser::Serializer for ValueToNumberSerializer {
+    type Ok = Number;
+    type Error = SerializerError;
+    type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
+
+    forward_to_unserializable_type! {
+        bool => serialize_bool,
+        char => serialize_char,
+    }
+
+    fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value as i64))
+    }
+
+    fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value as i64))
+    }
+
+    fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value as i64))
+    }
+
+    fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value as i64))
+    }
+
+    fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value as u64))
+    }
+
+    fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value as u64))
+    }
+
+    fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value as u64))
+    }
+
+    fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value as u64))
+    }
+
+    fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(value as f64))
+    }
+
+    fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(value))
+    }
+
+    fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
+        Number::from_str(value).map_err(SerializerError::custom)
+    }
+
+    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "&[u8]" })
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "Option" })
+    }
+
+    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+        where T: Serialize,
+    {
+        Err(SerializerError::UnserializableType { name: "Option" })
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "()" })
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "unit struct" })
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str
+    ) -> Result<Self::Ok, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { name: "unit variant" })
+    }
+
+    fn serialize_newtype_struct<T: ?Sized>(
+        self,
+        _name: &'static str,
+        value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_newtype_variant<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: Serialize,
+    {
+        Err(SerializerError::UnserializableType { name: "newtype variant" })
+    }
+
+    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "sequence" })
+    }
+
+    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "tuple" })
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleStruct, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { name: "tuple struct" })
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleVariant, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { name: "tuple variant" })
+    }
+
+    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
+        Err(SerializerError::UnserializableType { name: "map" })
+    }
+
+    fn serialize_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStruct, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { name: "struct" })
+    }
+
+    fn serialize_struct_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStructVariant, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { name: "struct variant" })
+    }
+}
(removed file, 64 lines)
@@ -1,64 +0,0 @@
-use std::path::PathBuf;
-use std::error::Error;
-
-use hashbrown::HashSet;
-use serde::Serialize;
-
-use crate::database::serde::serializer::Serializer;
-use crate::database::serde::SerializerError;
-use crate::tokenizer::TokenizerBuilder;
-use crate::database::Schema;
-
-use crate::DocumentId;
-use super::{Update, RawUpdateBuilder};
-
-pub struct UpdateBuilder {
-    schema: Schema,
-    raw_builder: RawUpdateBuilder,
-}
-
-impl UpdateBuilder {
-    pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
-        UpdateBuilder {
-            schema: schema,
-            raw_builder: RawUpdateBuilder::new(path),
-        }
-    }
-
-    pub fn update_document<T, B>(
-        &mut self,
-        document: T,
-        tokenizer_builder: &B,
-        stop_words: &HashSet<String>,
-    ) -> Result<DocumentId, SerializerError>
-    where T: Serialize,
-          B: TokenizerBuilder,
-    {
-        let document_id = self.schema.document_id(&document)?;
-        let update = self.raw_builder.document_update(document_id);
-
-        let serializer = Serializer {
-            schema: &self.schema,
-            document_id: document_id,
-            tokenizer_builder: tokenizer_builder,
-            update: update,
-            stop_words: stop_words,
-        };
-
-        document.serialize(serializer)?;
-
-        Ok(document_id)
-    }
-
-    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
-    where T: Serialize,
-    {
-        let document_id = self.schema.document_id(&document)?;
-        self.raw_builder.document_update(document_id).remove();
-        Ok(document_id)
-    }
-
-    pub fn build(self) -> Result<Update, Box<Error>> {
-        self.raw_builder.build()
-    }
-}
src/database/update/index_event.rs (new file, +55 lines)
@@ -0,0 +1,55 @@
+use std::error::Error;
+
+use byteorder::{ReadBytesExt, WriteBytesExt};
+
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
+use crate::database::Index;
+use crate::data::DocIds;
+
+pub enum WriteIndexEvent<'a> {
+    RemovedDocuments(&'a DocIds),
+    UpdatedDocuments(&'a Index),
+}
+
+impl<'a> WriteToBytes for WriteIndexEvent<'a> {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        match self {
+            WriteIndexEvent::RemovedDocuments(doc_ids) => {
+                let _ = bytes.write_u8(0);
+                doc_ids.write_to_bytes(bytes);
+            },
+            WriteIndexEvent::UpdatedDocuments(index) => {
+                let _ = bytes.write_u8(1);
+                index.write_to_bytes(bytes);
+            }
+        }
+    }
+}
+
+pub enum ReadIndexEvent {
+    RemovedDocuments(DocIds),
+    UpdatedDocuments(Index),
+}
+
+impl ReadIndexEvent {
+    pub fn updated_documents(self) -> Option<Index> {
+        use ReadIndexEvent::*;
+        match self {
+            RemovedDocuments(_) => None,
+            UpdatedDocuments(index) => Some(index),
+        }
+    }
+}
+
+impl FromSharedDataCursor for ReadIndexEvent {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
+        match cursor.read_u8()? {
+            0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments),
+            1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments),
+            _ => unreachable!(),
+        }
+    }
+}
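Both event types above frame their payload with a single leading tag byte (0 for removed documents, 1 for updated documents) and dispatch on it when reading. A minimal, self-contained sketch of that framing pattern using the same byteorder traits (this example is ours, not part of the patch):

    use std::io::Cursor;
    use byteorder::{ReadBytesExt, WriteBytesExt};

    fn main() -> std::io::Result<()> {
        // Writing: emit the tag byte first, then the payload bytes.
        let mut bytes = Vec::new();
        bytes.write_u8(1)?;                   // 1 = "updated documents"
        bytes.extend_from_slice(b"payload");  // stand-in for the serialized Index

        // Reading: dispatch on the tag byte, as ReadIndexEvent does.
        let mut cursor = Cursor::new(bytes);
        match cursor.read_u8()? {
            0 => println!("removed documents event"),
            1 => println!("updated documents event"),
            _ => unreachable!(),
        }
        Ok(())
    }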
@@ -1,17 +1,239 @@
-use std::path::{Path, PathBuf};
+use std::collections::{HashSet, BTreeMap};
+use std::error::Error;

-mod builder;
-mod raw_builder;
+use rocksdb::rocksdb::{Writable, WriteBatch};
+use hashbrown::hash_map::HashMap;
+use sdset::{Set, SetBuf};
+use serde::Serialize;

-pub use self::builder::UpdateBuilder;
-pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
+use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
+use crate::database::serde::serializer::Serializer;
+use crate::database::serde::SerializerError;
+use crate::database::schema::SchemaAttr;
+use crate::database::schema::Schema;
+use crate::database::index::IndexBuilder;
+use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
+use crate::database::{RankedMap, Number};
+use crate::tokenizer::TokenizerBuilder;
+use crate::write_to_bytes::WriteToBytes;
+use crate::data::DocIds;
+use crate::{DocumentId, DocIndex};
+
+pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
+pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
+
+mod index_event;
+mod ranked_map_event;
+
+pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec

 pub struct Update {
-    sst_file: PathBuf,
+    schema: Schema,
+    raw_builder: RawUpdateBuilder,
 }

 impl Update {
-    pub fn path(&self) -> &Path {
-        &self.sst_file
+    pub(crate) fn new(schema: Schema) -> Update {
+        Update { schema, raw_builder: RawUpdateBuilder::new() }
+    }
+
+    pub fn update_document<T, B>(
+        &mut self,
+        document: T,
+        tokenizer_builder: &B,
+        stop_words: &HashSet<String>,
+    ) -> Result<DocumentId, SerializerError>
+    where T: Serialize,
+          B: TokenizerBuilder,
+    {
+        let document_id = self.schema.document_id(&document)?;
+
+        let serializer = Serializer {
+            schema: &self.schema,
+            document_id: document_id,
+            tokenizer_builder: tokenizer_builder,
+            update: &mut self.raw_builder.document_update(document_id)?,
+            stop_words: stop_words,
+        };
+
+        document.serialize(serializer)?;
+
+        Ok(document_id)
+    }
+
+    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
+    where T: Serialize,
+    {
+        let document_id = self.schema.document_id(&document)?;
+        self.raw_builder.document_update(document_id)?.remove()?;
+        Ok(document_id)
+    }
+
+    pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
+        self.raw_builder.build()
+    }
+}
+
+#[derive(Copy, Clone, PartialEq, Eq)]
+enum UpdateType {
+    Updated,
+    Deleted,
+}
+
+use UpdateType::{Updated, Deleted};
+
+pub struct RawUpdateBuilder {
+    documents_update: HashMap<DocumentId, UpdateType>,
+    documents_ranked_fields: RankedMap,
+    indexed_words: BTreeMap<Token, Vec<DocIndex>>,
+    batch: WriteBatch,
+}
+
+impl RawUpdateBuilder {
+    pub fn new() -> RawUpdateBuilder {
+        RawUpdateBuilder {
+            documents_update: HashMap::new(),
+            documents_ranked_fields: HashMap::new(),
+            indexed_words: BTreeMap::new(),
+            batch: WriteBatch::new(),
+        }
+    }
+
+    pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
+        use serde::ser::Error;
+
+        match self.documents_update.get(&document_id) {
+            Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
+            Some(Updated) => Err(SerializerError::custom(
+                "This document has already been removed and cannot be updated in the same update"
+            )),
+        }
+    }
+
+    pub fn build(self) -> Result<WriteBatch, Box<Error>> {
+        // create the list of all the removed documents
+        let removed_documents = {
+            let mut document_ids = Vec::new();
+            for (id, update_type) in self.documents_update {
+                if update_type == Deleted {
+                    document_ids.push(id);
+                }
+            }
+
+            document_ids.sort_unstable();
+            let setbuf = SetBuf::new_unchecked(document_ids);
+            DocIds::new(&setbuf)
+        };
+
+        // create the Index of all the document updates
+        let index = {
+            let mut builder = IndexBuilder::new();
+            for (key, mut indexes) in self.indexed_words {
+                indexes.sort_unstable();
+                let indexes = Set::new_unchecked(&indexes);
+                builder.insert(key, indexes).unwrap();
+            }
+            builder.build()
+        };
+
+        // WARN: removed documents must absolutely
+        //       be merged *before* document updates
+
+        // === index ===
+
+        if !removed_documents.is_empty() {
+            // remove the documents using the appropriate IndexEvent
+            let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes();
+            self.batch.merge(DATA_INDEX, &event_bytes)?;
+        }
+
+        // update the documents using the appropriate IndexEvent
+        let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes();
+        self.batch.merge(DATA_INDEX, &event_bytes)?;
+
+        // === ranked map ===
+
+        if !removed_documents.is_empty() {
+            // update the ranked map using the appropriate RankedMapEvent
+            let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes();
+            self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
+        }
+
+        // update the documents using the appropriate IndexEvent
+        let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes();
+        self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
+
+        Ok(self.batch)
+    }
+}
+
+pub struct DocumentUpdate<'a> {
+    document_id: DocumentId,
+    inner: &'a mut RawUpdateBuilder,
+}
+
+impl<'a> DocumentUpdate<'a> {
+    pub fn remove(&mut self) -> Result<(), SerializerError> {
+        use serde::ser::Error;
+
+        if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
+            return Err(SerializerError::custom(
+                "This document has already been updated and cannot be removed in the same update"
+            ));
+        }
+
+        let start = DocumentKey::new(self.document_id).with_attribute_min();
+        let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1
+        self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;
+
+        Ok(())
+    }
+
+    pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
+        use serde::ser::Error;
+
+        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
+            return Err(SerializerError::custom(
+                "This document has already been deleted and cannot be updated in the same update"
+            ));
+        }
+
+        let key = DocumentKeyAttr::new(self.document_id, attr);
+        self.inner.batch.put(key.as_ref(), &value)?;
+
+        Ok(())
+    }
+
+    pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
+        use serde::ser::Error;
+
+        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
+            return Err(SerializerError::custom(
+                "This document has already been deleted and cannot be updated in the same update"
+            ));
+        }
+
+        self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);
+
+        Ok(())
+    }
+
+    pub fn register_ranked_attribute(
+        &mut self,
+        attr: SchemaAttr,
+        number: Number,
+    ) -> Result<(), SerializerError>
+    {
+        use serde::ser::Error;
+
+        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
+            return Err(SerializerError::custom(
+                "This document has already been deleted, ranked attributes cannot be added in the same update"
+            ));
+        }
+
+        self.inner.documents_ranked_fields.insert((self.document_id, attr), number);
+
+        Ok(())
     }
 }
src/database/update/ranked_map_event.rs (new file, +58 lines)
@@ -0,0 +1,58 @@
+use std::error::Error;
+
+use byteorder::{ReadBytesExt, WriteBytesExt};
+
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
+use crate::database::RankedMap;
+use crate::data::DocIds;
+
+pub enum WriteRankedMapEvent<'a> {
+    RemovedDocuments(&'a DocIds),
+    UpdatedDocuments(&'a RankedMap),
+}
+
+impl<'a> WriteToBytes for WriteRankedMapEvent<'a> {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        match self {
+            WriteRankedMapEvent::RemovedDocuments(doc_ids) => {
+                let _ = bytes.write_u8(0);
+                doc_ids.write_to_bytes(bytes);
+            },
+            WriteRankedMapEvent::UpdatedDocuments(ranked_map) => {
+                let _ = bytes.write_u8(1);
+                bincode::serialize_into(bytes, ranked_map).unwrap()
+            }
+        }
+    }
+}
+
+pub enum ReadRankedMapEvent {
+    RemovedDocuments(DocIds),
+    UpdatedDocuments(RankedMap),
+}
+
+impl ReadRankedMapEvent {
+    pub fn updated_documents(self) -> Option<RankedMap> {
+        use ReadRankedMapEvent::*;
+        match self {
+            RemovedDocuments(_) => None,
+            UpdatedDocuments(ranked_map) => Some(ranked_map),
+        }
+    }
+}
+
+impl FromSharedDataCursor for ReadRankedMapEvent {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
+        match cursor.read_u8()? {
+            0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments),
+            1 => {
+                let ranked_map = bincode::deserialize_from(cursor)?;
+                Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map))
+            },
+            _ => unreachable!(),
+        }
+    }
+}
(removed file, 168 lines)
@@ -1,168 +0,0 @@
-use std::collections::btree_map::{BTreeMap, Entry};
-use std::path::PathBuf;
-use std::error::Error;
-
-use rocksdb::rocksdb_options;
-use hashbrown::HashMap;
-use fst::map::Map;
-use sdset::Set;
-
-use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
-use crate::database::{DATA_INDEX, DocumentKeyAttr};
-use crate::database::schema::SchemaAttr;
-use crate::data::{DocIds, DocIndexes};
-use crate::{DocumentId, DocIndex};
-use super::Update;
-
-type Token = Vec<u8>; // TODO could be replaced by a SmallVec
-type Value = Vec<u8>;
-
-pub struct RawUpdateBuilder {
-    sst_file: PathBuf,
-    document_updates: BTreeMap<DocumentId, DocumentUpdate>,
-}
-
-pub struct DocumentUpdate {
-    cleared: bool,
-    words_indexes: HashMap<Token, Vec<DocIndex>>,
-    attributes: BTreeMap<SchemaAttr, Value>,
-}
-
-impl DocumentUpdate {
-    pub fn new() -> DocumentUpdate {
-        DocumentUpdate {
-            cleared: false,
-            words_indexes: HashMap::new(),
-            attributes: BTreeMap::new(),
-        }
-    }
-
-    pub fn remove(&mut self) {
-        self.cleared = true;
-        self.clear();
-    }
-
-    pub fn clear(&mut self) {
-        self.words_indexes.clear();
-        self.attributes.clear();
-    }
-
-    pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
-        self.attributes.insert(attr, value);
-    }
-
-    pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
-        self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
-    }
-}
-
-impl RawUpdateBuilder {
-    pub fn new(path: PathBuf) -> RawUpdateBuilder {
-        RawUpdateBuilder {
-            sst_file: path,
-            document_updates: BTreeMap::new(),
-        }
-    }
-
-    pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
-        match self.document_updates.entry(document_id) {
-            Entry::Occupied(mut occupied) => {
-                occupied.get_mut().clear();
-                occupied.into_mut()
-            },
-            Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
-        }
-    }
-
-    pub fn build(mut self) -> Result<Update, Box<Error>> {
-        let mut removed_document_ids = Vec::new();
-        let mut words_indexes = BTreeMap::new();
-
-        for (&id, update) in self.document_updates.iter_mut() {
-            if update.cleared { removed_document_ids.push(id) }
-
-            for (token, indexes) in &update.words_indexes {
-                words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
-            }
-        }
-
-        let negative = {
-            let removed_document_ids = Set::new_unchecked(&removed_document_ids);
-            let doc_ids = DocIds::new(removed_document_ids);
-            Negative::new(doc_ids)
-        };
-
-        let positive = {
-            let mut positive_builder = PositiveBuilder::memory();
-
-            for (key, mut indexes) in words_indexes {
-                indexes.sort_unstable();
-                let indexes = Set::new_unchecked(&indexes);
-                positive_builder.insert(key, indexes)?;
-            }
-
-            let (map, indexes) = positive_builder.into_inner()?;
-            let map = Map::from_bytes(map)?;
-            let indexes = DocIndexes::from_bytes(indexes)?;
-            Positive::new(map, indexes)
-        };
-
-        let index = Index { negative, positive };
-
-        let env_options = rocksdb_options::EnvOptions::new();
-        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
-        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
-        file_writer.open(&self.sst_file.to_string_lossy())?;
-
-        // write the data-index
-        let mut bytes = Vec::new();
-        index.write_to_bytes(&mut bytes);
-        file_writer.merge(DATA_INDEX, &bytes)?;
-
-        // write all the documents attributes updates
-        for (id, update) in self.document_updates {
-
-            let mut last_attr: Option<SchemaAttr> = None;
-            for (attr, value) in update.attributes {
-
-                if update.cleared {
-                    // if there is no last attribute, remove from the first attribute
-                    let start_attr = match last_attr {
-                        Some(attr) => attr.next(),
-                        None => Some(SchemaAttr::min())
-                    };
-                    let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
-                    let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));
-
-                    // delete_range between (last_attr + 1) and (attr - 1)
-                    if let (Some(start), Some(end)) = (start, end) {
-                        file_writer.delete_range(start.as_ref(), end.as_ref())?;
-                    }
-                }
-
-                let key = DocumentKeyAttr::new(id, attr);
-                file_writer.put(key.as_ref(), &value)?;
-                last_attr = Some(attr);
-            }
-
-            if update.cleared {
-                // if there is no last attribute, remove from the first attribute
-                let start_attr = match last_attr {
-                    Some(attr) => attr.next(),
-                    None => Some(SchemaAttr::min())
-                };
-                let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
-                let end = DocumentKeyAttr::with_attribute_max(id);
-
-                // delete_range between (last_attr + 1) and attr_max
-                if let Some(start) = start {
-                    file_writer.delete_range(start.as_ref(), end.as_ref())?;
-                }
-            }
-        }
-
-        file_writer.finish()?;
-
-        Ok(Update { sst_file: self.sst_file })
-    }
-}
@@ -7,12 +7,14 @@ use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
 use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
 use serde::de::DeserializeOwned;

+use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
+use crate::database::serde::deserializer::Deserializer;
 use crate::database::{DocumentKey, DocumentKeyAttr};
-use crate::database::{retrieve_data_schema, retrieve_data_index};
-use crate::database::deserializer::Deserializer;
+use crate::rank::{QueryBuilder, FilterFunc};
 use crate::database::schema::Schema;
 use crate::database::index::Index;
-use crate::rank::{QueryBuilder, FilterFunc};
+use crate::database::RankedMap;
+use crate::database::Config;
 use crate::DocumentId;

 pub struct DatabaseView<D>
@@ -20,7 +22,9 @@ where D: Deref<Target=DB>
 {
     snapshot: Snapshot<D>,
     index: Index,
+    ranked_map: RankedMap,
     schema: Schema,
+    config: Config,
 }

 impl<D> DatabaseView<D>
@@ -29,7 +33,9 @@ where D: Deref<Target=DB>
     pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
         let schema = retrieve_data_schema(&snapshot)?;
         let index = retrieve_data_index(&snapshot)?;
-        Ok(DatabaseView { snapshot, index, schema })
+        let ranked_map = retrieve_data_ranked_map(&snapshot)?;
+        let config = retrieve_config(&snapshot)?;
+        Ok(DatabaseView { snapshot, index, ranked_map, schema, config })
     }

     pub fn schema(&self) -> &Schema {
@@ -40,6 +46,10 @@ where D: Deref<Target=DB>
         &self.index
     }

+    pub fn ranked_map(&self) -> &RankedMap {
+        &self.ranked_map
+    }
+
     pub fn into_snapshot(self) -> Snapshot<D> {
         self.snapshot
     }
@@ -48,6 +58,10 @@ where D: Deref<Target=DB>
         &self.snapshot
     }

+    pub fn config(&self) -> &Config {
+        &self.config
+    }
+
     pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
         Ok(self.snapshot.get(key)?)
     }
@@ -71,12 +85,25 @@ where D: Deref<Target=DB>
         Ok(())
     }

-    pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
-        QueryBuilder::new(self)
+    pub fn query_builder(&self) -> QueryBuilder<FilterFunc> {
+        QueryBuilder::new(self.index())
+    }
+
+    pub fn raw_field_by_document_id(
+        &self,
+        name: &str,
+        id: DocumentId
+    ) -> Result<Option<Vec<u8>>, Box<Error>>
+    {
+        let attr = self.schema.attribute(name).ok_or("field not found")?;
+        let key = DocumentKeyAttr::new(id, attr);
+        let vector = self.snapshot.get(key.as_ref())?;
+
+        Ok(vector.map(|v| v.to_vec()))
     }

     pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
-    where T: DeserializeOwned
+    where T: DeserializeOwned,
     {
         let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
         Ok(T::deserialize(&mut deserializer)?)

src/lib.rs (47 lines changed)
@@ -5,21 +5,34 @@ pub mod database;
 pub mod data;
 pub mod rank;
 pub mod tokenizer;
-mod attribute;
-mod word_area;
 mod common_words;
+mod shared_data_cursor;
+mod write_to_bytes;
+
+use serde_derive::{Serialize, Deserialize};

 pub use rocksdb;

 pub use self::tokenizer::Tokenizer;
 pub use self::common_words::CommonWords;
-pub use self::attribute::{Attribute, AttributeError};
-pub use self::word_area::{WordArea, WordAreaError};
+
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
+    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
+    (c >= '\u{3040}' && c <= '\u{309f}') ||
+    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
+    (c >= '\u{3100}' && c <= '\u{312f}') ||
+    (c >= '\u{3200}' && c <= '\u{32ff}') ||
+    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
+    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
+    (c >= '\u{f900}' && c <= '\u{faff}')
+}

 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
 /// Helpful for custom ranking.
+#[derive(Serialize, Deserialize)]
 #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
 pub struct DocumentId(u64);

@@ -36,14 +49,16 @@ pub struct DocIndex {

     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,

     /// The position in bytes where the word was found
     /// along with the length of it.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }

 /// This structure represent a matching word with informations
@@ -68,7 +83,8 @@ pub struct Match {

     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,

     /// Whether the word that match is an exact match or a prefix.
     pub is_exact: bool,
@@ -78,7 +94,8 @@ pub struct Match {
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }

 impl Match {
@@ -86,9 +103,11 @@ impl Match {
         Match {
             query_index: 0,
             distance: 0,
-            attribute: Attribute::new_faillible(0, 0),
+            attribute: 0,
+            word_index: 0,
             is_exact: false,
-            word_area: WordArea::new_faillible(0, 0),
+            char_index: 0,
+            char_length: 0,
         }
     }
@@ -96,9 +115,11 @@ impl Match {
         Match {
             query_index: u32::max_value(),
             distance: u8::max_value(),
-            attribute: Attribute::max_value(),
+            attribute: u16::max_value(),
+            word_index: u32::max_value(),
             is_exact: true,
-            word_area: WordArea::max_value(),
+            char_index: u32::max_value(),
+            char_length: u16::max_value(),
         }
     }
 }
@@ -110,6 +131,6 @@ mod tests {

     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 24);
     }
 }
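The new top-level is_cjk helper classifies a character purely by Unicode range. A quick sketch of calling it (the assertions are ours, not part of the patch):

    fn main() {
        assert!(meilidb::is_cjk('漢'));  // U+6F22, CJK Unified Ideographs
        assert!(meilidb::is_cjk('か'));  // U+304B, Hiragana
        assert!(!meilidb::is_cjk('a'));  // ASCII letters are not CJK
    }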
@@ -1,19 +1,13 @@
 use std::cmp::Ordering;
-use std::ops::Deref;
-
-use rocksdb::DB;

 use crate::rank::criterion::Criterion;
-use crate::database::DatabaseView;
-use crate::rank::Document;
+use crate::rank::RawDocument;

 #[derive(Debug, Clone, Copy)]
 pub struct DocumentId;

-impl<D> Criterion<D> for DocumentId
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
+impl Criterion for DocumentId {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
         lhs.id.cmp(&rhs.id)
     }
 }
@@ -1,33 +1,40 @@
 use std::cmp::Ordering;
-use std::ops::Deref;

-use rocksdb::DB;
-use group_by::GroupBy;
+use slice_group_by::GroupBy;

-use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
-use crate::database::DatabaseView;
-use crate::Match;
+use crate::rank::RawDocument;

 #[inline]
-fn contains_exact(matches: &&[Match]) -> bool {
-    matches.iter().any(|m| m.is_exact)
-}
-
-#[inline]
-fn number_exact_matches(matches: &[Match]) -> usize {
-    GroupBy::new(matches, match_query_index).filter(contains_exact).count()
+fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
+    let mut count = 0;
+    let mut index = 0;
+
+    for group in query_index.linear_group() {
+        let len = group.len();
+        count += is_exact[index..index + len].contains(&true) as usize;
+        index += len;
+    }
+
+    count
 }

 #[derive(Debug, Clone, Copy)]
 pub struct Exact;

-impl<D> Criterion<D> for Exact
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
-        let lhs = number_exact_matches(&lhs.matches);
-        let rhs = number_exact_matches(&rhs.matches);
+impl Criterion for Exact {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let lhs = {
+            let query_index = lhs.query_index();
+            let is_exact = lhs.is_exact();
+            number_exact_matches(query_index, is_exact)
+        };
+
+        let rhs = {
+            let query_index = rhs.query_index();
+            let is_exact = rhs.is_exact();
+            number_exact_matches(query_index, is_exact)
+        };
+
         lhs.cmp(&rhs).reverse()
     }
 }
@@ -4,16 +4,11 @@ mod words_proximity;
 mod sum_of_words_attribute;
 mod sum_of_words_position;
 mod exact;
-mod sort_by;
+mod sort_by_attr;
 mod document_id;

 use std::cmp::Ordering;
-use std::ops::Deref;
-
-use rocksdb::DB;
-
-use crate::database::DatabaseView;
-use crate::rank::Document;
+use crate::rank::RawDocument;

 pub use self::{
     sum_of_typos::SumOfTypos,
@@ -22,60 +17,51 @@ pub use self::{
     sum_of_words_attribute::SumOfWordsAttribute,
     sum_of_words_position::SumOfWordsPosition,
     exact::Exact,
-    sort_by::SortBy,
+    sort_by_attr::SortByAttr,
     document_id::DocumentId,
 };

-pub trait Criterion<D>
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
+pub trait Criterion: Send + Sync {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;

     #[inline]
-    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
-        self.evaluate(lhs, rhs, view) == Ordering::Equal
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        self.evaluate(lhs, rhs) == Ordering::Equal
     }
 }

-impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
-        (**self).evaluate(lhs, rhs, view)
+impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        (**self).evaluate(lhs, rhs)
     }

-    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
-        (**self).eq(lhs, rhs, view)
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        (**self).eq(lhs, rhs)
     }
 }

-impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T>
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
-        (**self).evaluate(lhs, rhs, view)
+impl<T: Criterion + ?Sized> Criterion for Box<T> {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        (**self).evaluate(lhs, rhs)
     }

-    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
-        (**self).eq(lhs, rhs, view)
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        (**self).eq(lhs, rhs)
     }
 }

 #[derive(Default)]
-pub struct CriteriaBuilder<D>
-where D: Deref<Target=DB>
-{
-    inner: Vec<Box<dyn Criterion<D>>>
+pub struct CriteriaBuilder<'a> {
+    inner: Vec<Box<dyn Criterion + 'a>>
 }

-impl<D> CriteriaBuilder<D>
-where D: Deref<Target=DB>
+impl<'a> CriteriaBuilder<'a>
 {
-    pub fn new() -> CriteriaBuilder<D> {
+    pub fn new() -> CriteriaBuilder<'a> {
         CriteriaBuilder { inner: Vec::new() }
     }

-    pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> {
+    pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
         CriteriaBuilder { inner: Vec::with_capacity(capacity) }
     }

@@ -83,33 +69,29 @@ where D: Deref<Target=DB>
         self.inner.reserve(additional)
     }

-    pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D>
-    where C: 'static + Criterion<D>,
+    pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
+    where C: Criterion,
     {
         self.push(criterion);
         self
     }

-    pub fn push<C>(&mut self, criterion: C)
-    where C: 'static + Criterion<D>,
+    pub fn push<C: 'a>(&mut self, criterion: C)
+    where C: Criterion,
     {
         self.inner.push(Box::new(criterion));
     }

-    pub fn build(self) -> Criteria<D> {
+    pub fn build(self) -> Criteria<'a> {
         Criteria { inner: self.inner }
     }
 }

-pub struct Criteria<D>
-where D: Deref<Target=DB>
-{
-    inner: Vec<Box<dyn Criterion<D>>>,
+pub struct Criteria<'a> {
+    inner: Vec<Box<dyn Criterion + 'a>>,
 }

-impl<D> Default for Criteria<D>
-where D: Deref<Target=DB>
-{
+impl<'a> Default for Criteria<'a> {
     fn default() -> Self {
         CriteriaBuilder::with_capacity(7)
             .add(SumOfTypos)
@@ -123,10 +105,8 @@ where D: Deref<Target=DB>
     }
 }

-impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D>
-where D: Deref<Target=DB>
-{
-    fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
+impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> {
+    fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
         &self.inner
     }
 }
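With the reworked trait, a criterion no longer needs a DatabaseView or a Deref<Target=DB> bound; it only compares two RawDocument values. A hypothetical extra criterion written against that signature (illustrative sketch only, not part of this change; it assumes query_index() yields one entry per matched position, as the criteria above rely on):

    use std::cmp::Ordering;

    use crate::rank::criterion::Criterion;
    use crate::rank::RawDocument;

    /// Hypothetical example: rank documents with more matched word positions first.
    struct MoreMatches;

    impl Criterion for MoreMatches {
        fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
            // query_index() returns one entry per matched position, so its
            // length is a rough match count for the document.
            let lhs = lhs.query_index().len();
            let rhs = rhs.query_index().len();
            lhs.cmp(&rhs).reverse() // more matches sorts first
        }
    }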
@@ -1,28 +1,28 @@
 use std::cmp::Ordering;
-use std::ops::Deref;

-use rocksdb::DB;
-use group_by::GroupBy;
+use slice_group_by::GroupBy;

-use crate::rank::{match_query_index, Document};
 use crate::rank::criterion::Criterion;
-use crate::database::DatabaseView;
-use crate::Match;
+use crate::rank::RawDocument;

 #[inline]
-fn number_of_query_words(matches: &[Match]) -> usize {
-    GroupBy::new(matches, match_query_index).count()
+fn number_of_query_words(query_index: &[u32]) -> usize {
+    query_index.linear_group().count()
 }

 #[derive(Debug, Clone, Copy)]
 pub struct NumberOfWords;

-impl<D> Criterion<D> for NumberOfWords
-where D: Deref<Target=DB>
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
-        let lhs = number_of_query_words(&lhs.matches);
-        let rhs = number_of_query_words(&rhs.matches);
+impl Criterion for NumberOfWords {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let lhs = {
+            let query_index = lhs.query_index();
+            number_of_query_words(query_index)
+        };
+        let rhs = {
+            let query_index = rhs.query_index();
+            number_of_query_words(query_index)
+        };
+
         lhs.cmp(&rhs).reverse()
     }
 }
(removed file, 82 lines)
@@ -1,82 +0,0 @@
-use std::cmp::Ordering;
-use std::ops::Deref;
-use std::marker;
-
-use rocksdb::DB;
-use serde::de::DeserializeOwned;
-
-use crate::rank::criterion::Criterion;
-use crate::database::DatabaseView;
-use crate::rank::Document;
-
-/// An helper struct that permit to sort documents by
-/// some of their stored attributes.
-///
-/// # Note
-///
-/// If a document cannot be deserialized it will be considered [`None`][].
-///
-/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
-/// so you must check the [`Ord`] of `Option` implementation.
-///
-/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
-/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
-///
-/// # Example
-///
-/// ```no-test
-/// use serde_derive::Deserialize;
-/// use meilidb::rank::criterion::*;
-///
-/// #[derive(Deserialize, PartialOrd, Ord, PartialEq, Eq)]
-/// struct TimeOnly {
-///     time: String,
-/// }
-///
-/// let builder = CriteriaBuilder::with_capacity(8)
-///        .add(SumOfTypos)
-///        .add(NumberOfWords)
-///        .add(WordsProximity)
-///        .add(SumOfWordsAttribute)
-///        .add(SumOfWordsPosition)
-///        .add(Exact)
-///        .add(SortBy::<TimeOnly>::new())
-///        .add(DocumentId);
-///
-/// let criterion = builder.build();
-///
-/// ```
-pub struct SortBy<T> {
-    _phantom: marker::PhantomData<T>,
-}
-
-impl<T> SortBy<T> {
-    pub fn new() -> Self {
-        SortBy::default()
-    }
-}
-
-impl<T> Default for SortBy<T> {
-    fn default() -> SortBy<T> {
-        SortBy { _phantom: marker::PhantomData }
-    }
-}
-
-impl<T, D> Criterion<D> for SortBy<T>
-where D: Deref<Target=DB>,
-      T: DeserializeOwned + Ord,
-{
-    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
-        let lhs = match view.document_by_id::<T>(lhs.id) {
-            Ok(doc) => Some(doc),
-            Err(e) => { eprintln!("{}", e); None },
-        };
-
-        let rhs = match view.document_by_id::<T>(rhs.id) {
-            Ok(doc) => Some(doc),
-            Err(e) => { eprintln!("{}", e); None },
-        };
-
-        lhs.cmp(&rhs)
-    }
-}
src/rank/criterion/sort_by_attr.rs (new file, +122 lines)
@@ -0,0 +1,122 @@
+use std::cmp::Ordering;
+use std::error::Error;
+use std::fmt;
+
+use crate::database::schema::{Schema, SchemaAttr};
+use crate::rank::criterion::Criterion;
+use crate::database::RankedMap;
+use crate::rank::RawDocument;
+
+/// An helper struct that permit to sort documents by
+/// some of their stored attributes.
+///
+/// # Note
+///
+/// If a document cannot be deserialized it will be considered [`None`][].
+///
+/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
+/// so you must check the [`Ord`] of `Option` implementation.
+///
+/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
+/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
+///
+/// # Example
+///
+/// ```ignore
+/// use serde_derive::Deserialize;
+/// use meilidb::rank::criterion::*;
+///
+/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
+///
+/// let builder = CriteriaBuilder::with_capacity(8)
+///        .add(SumOfTypos)
+///        .add(NumberOfWords)
+///        .add(WordsProximity)
+///        .add(SumOfWordsAttribute)
+///        .add(SumOfWordsPosition)
+///        .add(Exact)
+///        .add(custom_ranking)
+///        .add(DocumentId);
+///
+/// let criterion = builder.build();
+///
+/// ```
+pub struct SortByAttr<'a> {
+    ranked_map: &'a RankedMap,
+    attr: SchemaAttr,
+    reversed: bool,
+}
+
+impl<'a> SortByAttr<'a> {
+    pub fn lower_is_better(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        SortByAttr::new(ranked_map, schema, attr_name, false)
+    }
+
+    pub fn higher_is_better(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        SortByAttr::new(ranked_map, schema, attr_name, true)
+    }
+
+    fn new(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+        reversed: bool,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        let attr = match schema.attribute(attr_name) {
+            Some(attr) => attr,
+            None => return Err(SortByAttrError::AttributeNotFound),
+        };
+
+        if !schema.props(attr).is_ranked() {
+            return Err(SortByAttrError::AttributeNotRegisteredForRanking);
+        }
+
+        Ok(SortByAttr { ranked_map, attr, reversed })
+    }
+}
+
+impl<'a> Criterion for SortByAttr<'a> {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let lhs = self.ranked_map.get(&(lhs.id, self.attr));
+        let rhs = self.ranked_map.get(&(rhs.id, self.attr));
+
+        match (lhs, rhs) {
+            (Some(lhs), Some(rhs)) => {
+                let order = lhs.cmp(&rhs);
+                if self.reversed { order.reverse() } else { order }
+            },
+            (None, Some(_)) => Ordering::Greater,
+            (Some(_), None) => Ordering::Less,
+            (None, None) => Ordering::Equal,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum SortByAttrError {
+    AttributeNotFound,
+    AttributeNotRegisteredForRanking,
+}
+
+impl fmt::Display for SortByAttrError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use SortByAttrError::*;
+        match self {
+            AttributeNotFound => f.write_str("attribute not found in the schema"),
+            AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
+        }
+    }
+}
+
+impl Error for SortByAttrError { }
@@ -1,106 +1,79 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;

// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    match n {
        0 => 0.0,     // log(1)
        1 => 0.30102, // log(2)
        2 => 0.47712, // log(3)
        3 => 0.60205, // log(4)
        _ => panic!("invalid number"),
    }
}

#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
    let mut number_words = 0;
    let mut sum_typos = 0.0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_typos += custom_log10(distance[index]);
        number_words += 1;
        index += group.len();
    }

    (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;

impl Criterion for SumOfTypos {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            sum_matches_typos(query_index, distance)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            sum_matches_typos(query_index, distance)
        };

        lhs.cmp(&rhs).reverse()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let query_index1 = &[0, 1];
        let distance1 = &[1, 0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchette"
    //
@@ -109,48 +82,15 @@ mod tests {
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let query_index1 = &[0];
        let distance1 = &[0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
@@ -159,47 +99,14 @@ mod tests {
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 1];

        let query_index1 = &[0];
        let distance1 = &[0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }
}
@@ -1,32 +1,39 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
    let mut sum_attributes = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_attributes += attribute[index] as usize;
        index += group.len();
    }

    sum_attributes
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;

impl Criterion for SumOfWordsAttribute {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let attribute = lhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let attribute = rhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        lhs.cmp(&rhs)
    }
}
@@ -1,32 +1,39 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
    let mut sum_word_index = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_word_index += word_index[index] as usize;
        index += group.len();
    }

    sum_word_index
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;

impl Criterion for SumOfWordsPosition {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let word_index = lhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let word_index = rhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        lhs.cmp(&rhs)
    }
}
@@ -1,16 +1,17 @@
use std::cmp::{self, Ordering};

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;

const MAX_DISTANCE: u32 = 8;

#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
}

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
@@ -19,30 +20,58 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    }
}

fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
    if lattr != rattr { return MAX_DISTANCE }
    index_proximity(lwi, rwi)
}

fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
    let mut min_prox = u32::max_value();

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
            let a = clone_tuple(a);
            let b = clone_tuple(b);
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }

    min_prox
}

fn matches_proximity(
    query_index: &[u32],
    distance: &[u8],
    attribute: &[u16],
    word_index: &[u32],
) -> u32
{
    let mut query_index_groups = query_index.linear_group();
    let mut proximity = 0;
    let mut index = 0;

    let get_attr_wi = |index: usize, group_len: usize| {
        // retrieve the first distance group (with the lowest values)
        let len = distance[index..index + group_len].linear_group().next().unwrap().len();

        let rattr = &attribute[index..index + len];
        let rwi = &word_index[index..index + len];

        (rattr, rwi)
    };

    let mut last = query_index_groups.next().map(|group| {
        let attr_wi = get_attr_wi(index, group.len());
        index += group.len();
        attr_wi
    });

    // iter by windows of size 2
    while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
        let attr_wi = get_attr_wi(index, rhs.len());
        proximity += min_proximity(lhs, attr_wi);
        last = Some(attr_wi);
        index += rhs.len();
    }

    proximity
@@ -51,24 +80,32 @@ fn matches_proximity(matches: &[Match]) -> u32 {
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;

impl Criterion for WordsProximity {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            let attribute = lhs.attribute();
            let word_index = lhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            let attribute = rhs.attribute();
            let word_index = rhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        lhs.cmp(&rhs)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
@@ -80,18 +117,15 @@ mod tests {
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        // soup -> of = 8
        // + of -> the = 1
        // + the -> day = 8 (not 1)
        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
    }

    #[test]
@@ -106,57 +140,14 @@ mod tests {
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        // soup -> of = 1
        // + of -> the = 1
        // + the -> day = 1
        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
    }
}

src/rank/mod.rs
@@ -2,32 +2,182 @@ pub mod criterion
mod query_builder;
mod distinct_map;

use std::sync::Arc;

use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;

use crate::{Match, DocumentId};

pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
    pub id: DocumentId,
    pub matches: Vec<Match>,
}

impl Document {
    fn from_raw(raw: &RawDocument) -> Document {
        let len = raw.matches.range.len();
        let mut matches = Vec::with_capacity(len);

        let query_index = raw.query_index();
        let distance = raw.distance();
        let attribute = raw.attribute();
        let word_index = raw.word_index();
        let is_exact = raw.is_exact();
        let char_index = raw.char_index();
        let char_length = raw.char_length();

        for i in 0..len {
            let match_ = Match {
                query_index: query_index[i],
                distance: distance[i],
                attribute: attribute[i],
                word_index: word_index[i],
                is_exact: is_exact[i],
                char_index: char_index[i],
                char_length: char_length[i],
            };
            matches.push(match_);
        }

        Document { id: raw.id, matches }
    }
}

#[derive(Clone)]
pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
}

impl RawDocument {
    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
        RawDocument { id, matches: SharedMatches { range, matches } }
    }

    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
    }

    pub fn distance(&self) -> &[u8] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
    }

    pub fn attribute(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
    }

    pub fn word_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
    }

    pub fn is_exact(&self) -> &[bool] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }

    pub fn char_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
    }

    pub fn char_length(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
    }
}

pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
    let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    matches.par_sort_unstable();

    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
        let id = group[0].0;
        let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
        let end = start + group.len();
        docs_ranges.push((id, Range { start, end }));

        matches2.extend_from_slice(group);
    }

    let matches = Arc::new(matches2);
    docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
}

#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

impl Range {
    fn len(self) -> usize {
        self.end - self.start
    }
}

#[derive(Clone)]
pub struct SharedMatches {
    range: Range,
    matches: Arc<Matches>,
}

#[derive(Clone)]
struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    attribute: Vec<u16>,
    word_index: Vec<u32>,
    is_exact: Vec<bool>,
    char_index: Vec<u32>,
    char_length: Vec<u16>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
            char_index: Vec::with_capacity(cap),
            char_length: Vec::with_capacity(cap),
        }
    }

    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
        for (_, match_) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
            self.char_index.push(match_.char_index);
            self.char_length.push(match_.char_length);
        }
    }
}

@@ -1,30 +1,56 @@
use std::{cmp, mem};
use std::ops::Range;
use std::time::Instant;
use std::hash::Hash;
use std::rc::Rc;

use rayon::slice::ParallelSliceMut;
use slice_group_by::{GroupByMut, LinearStrGroupBy};
use hashbrown::HashMap;
use fst::Streamer;
use log::info;

use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::Index;
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};

#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
    Space,
    Cjk,
    Other,
}

fn classify_char(c: char) -> CharCategory {
    if c.is_whitespace() { CharCategory::Space }
    else if is_cjk(c) { CharCategory::Cjk }
    else { CharCategory::Other }
}

fn is_word(s: &&str) -> bool {
    !s.chars().any(char::is_whitespace)
}

fn same_group_category(a: char, b: char) -> bool {
    let ca = classify_char(a);
    let cb = classify_char(b);
    if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
}

fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
    let mut groups = LinearStrGroupBy::new(query, same_group_category)
                        .filter(is_word)
                        .map(str::to_lowercase)
                        .peekable();

    let mut automatons = Vec::new();
    while let Some(word) = groups.next() {
        let has_following_word = groups.peek().is_some();
        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
            automaton::build_dfa(&word)
        } else {
            automaton::build_prefix_dfa(&word)
@@ -35,43 +61,38 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
    automatons
}

pub type FilterFunc = fn(DocumentId) -> bool;

pub struct QueryBuilder<'i, 'c, FI> {
    index: &'i Index,
    criteria: Criteria<'c>,
    filter: Option<FI>,
}

impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
    pub fn new(index: &'i Index) -> Self {
        QueryBuilder::with_criteria(index, Criteria::default())
    }

    pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
        QueryBuilder { index, criteria, filter: None }
    }
}

impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
{
    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
    where F: Fn(DocumentId) -> bool,
    {
        QueryBuilder {
            index: self.index,
            criteria: self.criteria,
            filter: Some(function)
        }
    }

    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
    where F: Fn(DocumentId) -> Option<K>,
          K: Hash + Eq,
    {
        DistinctQueryBuilder {
@@ -81,19 +102,19 @@ where D: Deref<Target=DB>,
        }
    }

    fn query_all(&self, query: &str) -> Vec<RawDocument> {
        let automatons = split_whitespace_automatons(query);

        let mut stream = {
            let mut op_builder = fst::map::OpBuilder::new();
            for automaton in &automatons {
                let stream = self.index.map.search(automaton);
                op_builder.push(stream);
            }
            op_builder.union()
        };

        let mut matches = Vec::new();

        while let Some((input, indexed_values)) = stream.next() {
            for iv in indexed_values {
@@ -101,7 +122,7 @@ where D: Deref<Target=DB>,
                let distance = automaton.eval(input).to_u8();
                let is_exact = distance == 0 && input.len() == automaton.query_len();

                let doc_indexes = &self.index.indexes;
                let doc_indexes = &doc_indexes[iv.value as usize];

                for doc_index in doc_indexes {
@@ -109,41 +130,50 @@ where D: Deref<Target=DB>,
                        query_index: iv.index as u32,
                        distance: distance,
                        attribute: doc_index.attribute,
                        word_index: doc_index.word_index,
                        is_exact: is_exact,
                        char_index: doc_index.char_index,
                        char_length: doc_index.char_length,
                    };
                    matches.push((doc_index.document_id, match_));
                }
            }
        }

        let total_matches = matches.len();
        let raw_documents = raw_documents_from_matches(matches);

        info!("{} total documents to classify", raw_documents.len());
        info!("{} total matches to classify", total_matches);

        raw_documents
    }
}

impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
where FI: Fn(DocumentId) -> bool,
{
    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
        // We delegate the filter work to the distinct query builder,
        // specifying a distinct rule that has no effect.
        if self.filter.is_some() {
            let builder = self.with_distinct(|_| None as Option<()>, 1);
            return builder.query(query, range);
        }

        let start = Instant::now();
        let mut documents = self.query_all(query);
        info!("query_all took {:.2?}", start.elapsed());

        let mut groups = vec![documents.as_mut_slice()];

        'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut documents_seen = 0;

            for group in tmp_groups {
                info!("criterion {}, documents group of size {}", ci, group.len());

                // if this group does not overlap with the requested range,
                // push it without sorting and splitting it
                if documents_seen + group.len() < range.start {
@@ -152,9 +182,11 @@ where D: Deref<Target=DB>,
                    continue;
                }

                let start = Instant::now();
                group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
                info!("criterion {} sort took {:.2?}", ci, start.elapsed());

                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    documents_seen += group.len();
                    groups.push(group);
@@ -165,28 +197,22 @@ where D: Deref<Target=DB>,
            }
        }

        let offset = cmp::min(documents.len(), range.start);
        let iter = documents.into_iter().skip(offset).take(range.len());
        iter.map(|d| Document::from_raw(&d)).collect()
    }
}

pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
    inner: QueryBuilder<'i, 'c, FI>,
    function: FD,
    size: usize,
}

impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
{
    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
    where F: Fn(DocumentId) -> bool,
    {
        DistinctQueryBuilder {
            inner: self.inner.with_filter(function),
@@ -196,17 +222,18 @@ where D: Deref<Target=DB>,
    }
}

impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
where FI: Fn(DocumentId) -> bool,
      FD: Fn(DocumentId) -> Option<K>,
      K: Hash + Eq,
{
    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
        let start = Instant::now();
        let mut documents = self.inner.query_all(query);
        info!("query_all took {:.2?}", start.elapsed());

        let mut groups = vec![documents.as_mut_slice()];
        let mut key_cache = HashMap::new();

        let mut filter_map = HashMap::new();
        // these two variables informs on the current distinct map and
@@ -215,12 +242,14 @@ where D: Deref<Target=DB>,
        let mut distinct_map = DistinctMap::new(self.size);
        let mut distinct_raw_offset = 0;

        'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
            let mut documents_seen = 0;

            for group in tmp_groups {
                info!("criterion {}, documents group of size {}", ci, group.len());

                // if this group does not overlap with the requested range,
                // push it without sorting and splitting it
                if documents_seen + group.len() < distinct_raw_offset {
@@ -229,22 +258,24 @@ where D: Deref<Target=DB>,
                    continue;
                }

                let start = Instant::now();
                group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
                info!("criterion {} sort took {:.2?}", ci, start.elapsed());

                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    // we must compute the real distinguished len of this sub-group
                    for document in group.iter() {
                        let filter_accepted = match &self.inner.filter {
                            Some(filter) => {
                                let entry = filter_map.entry(document.id);
                                *entry.or_insert_with(|| (filter)(document.id))
                            },
                            None => true,
                        };

                        if filter_accepted {
                            let entry = key_cache.entry(document.id);
                            let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));

                            match key.clone() {
                                Some(key) => buf_distinct.register(key),
@@ -290,7 +321,7 @@ where D: Deref<Target=DB>,
            };

            if distinct_accepted && seen.len() > range.start {
                out_documents.push(Document::from_raw(&document));
                if out_documents.len() == range.len() { break }
            }
        }
src/shared_data_cursor.rs (new file)
@@ -0,0 +1,56 @@
use std::io::{self, Read, Cursor, BufRead};
use std::sync::Arc;
use crate::data::SharedData;

pub struct SharedDataCursor(Cursor<SharedData>);

impl SharedDataCursor {
    pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
        let len = bytes.len();
        let bytes = Arc::new(bytes);

        SharedDataCursor::from_shared_bytes(bytes, 0, len)
    }

    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
        let data = SharedData::new(bytes, offset, len);
        let cursor = Cursor::new(data);

        SharedDataCursor(cursor)
    }

    pub fn extract(&mut self, amt: usize) -> SharedData {
        let offset = self.0.position() as usize;
        let extracted = self.0.get_ref().range(offset, amt);
        self.0.consume(amt);

        extracted
    }
}

impl Read for SharedDataCursor {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.0.read(buf)
    }
}

impl BufRead for SharedDataCursor {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.0.fill_buf()
    }

    fn consume(&mut self, amt: usize) {
        self.0.consume(amt)
    }
}

pub trait FromSharedDataCursor: Sized {
    type Error;

    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;

    fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
        let mut cursor = SharedDataCursor::from_bytes(bytes);
        Self::from_shared_data_cursor(&mut cursor)
    }
}
@@ -1,4 +1,5 @@
use std::mem;
use crate::is_cjk;
use self::Separator::*;

pub trait TokenizerBuilder {
@@ -75,7 +76,7 @@ impl Separator {
fn detect_separator(c: char) -> Option<Separator> {
    match c {
        '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
        ' ' | '\'' | '"' => Some(Short),
        _ => None,
    }
}
@@ -109,9 +110,58 @@ impl<'a> Iterator for Tokenizer<'a> {
                    return Some(token)
                }

                distance = Some(distance.map_or(sep, |s| s.add(sep)));
            },
            None => {
                // if this is a Chinese, a Japanese or a Korean character
                // See <http://unicode-table.com>
                if is_cjk(c) {
                    match start_word {
                        Some(start_word) => {
                            let (prefix, tail) = self.inner.split_at(i);
                            let (spaces, word) = prefix.split_at(start_word);

                            self.inner = tail;
                            self.char_index += spaces.chars().count();
                            self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

                            let token = Token {
                                word: word,
                                word_index: self.word_index,
                                char_index: self.char_index,
                            };

                            self.word_index += 1;
                            self.char_index += word.chars().count();

                            return Some(token)
                        },
                        None => {
                            let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
                            let (spaces, word) = prefix.split_at(i);

                            self.inner = tail;
                            self.char_index += spaces.chars().count();
                            self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

                            let token = Token {
                                word: word,
                                word_index: self.word_index,
                                char_index: self.char_index,
                            };

                            if tail.chars().next().and_then(detect_separator).is_none() {
                                self.word_index += 1;
                            }
                            self.char_index += 1;

                            return Some(token)
                        }
                    }
                }

                if start_word.is_none() { start_word = Some(i) }
            },
        }
    }
@@ -150,11 +200,12 @@ mod tests {
    #[test]
    fn hard() {
        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
@@ -185,4 +236,24 @@ mod tests {
        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
        assert_eq!(tokenizer.next(), None);
    }

    #[test]
    fn hard_kanjis() {
        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");

        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");

        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
        assert_eq!(tokenizer.next(), None);
    }
}
102
src/word_area.rs
102
src/word_area.rs
@ -1,102 +0,0 @@
|
|||||||
use std::fmt;
|
|
||||||
|
|
||||||
/// Represent a word position in bytes along with the length of it.
|
|
||||||
///
|
|
||||||
/// It can represent words byte index to maximum 2^22 and
|
|
||||||
/// up to words of length 1024.
|
|
||||||
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
||||||
pub struct WordArea(u32);
|
|
||||||
|
|
||||||
impl WordArea {
|
|
||||||
/// Construct a `WordArea` from a word position in expresed as
|
|
||||||
/// a number of characters and the length of it.
|
|
||||||
///
|
|
||||||
/// # Panics
|
|
||||||
///
|
|
||||||
/// The char index must not be greater than 2^22
|
|
||||||
/// and the length not greater than 1024.
|
|
||||||
pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
|
|
||||||
if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
|
|
||||||
            return Err(WordAreaError::ByteIndexTooBig)
        }

        if length & 0b1111_1100_0000_0000 != 0 {
            return Err(WordAreaError::LengthTooBig)
        }

        let char_index = char_index << 10;
        Ok(WordArea(char_index | u32::from(length)))
    }

    pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
        match WordArea::new(char_index, length) {
            Ok(word_area) => word_area,
            Err(WordAreaError::ByteIndexTooBig) => {
                panic!("word area byte index must not be greater than 2^22")
            },
            Err(WordAreaError::LengthTooBig) => {
                panic!("word area length must not be greater than 1024")
            },
        }
    }

    pub(crate) fn max_value() -> WordArea {
        WordArea(u32::max_value())
    }

    #[inline]
    pub fn char_index(self) -> u32 {
        self.0 >> 10
    }

    #[inline]
    pub fn length(self) -> u16 {
        (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16
    }
}

impl fmt::Debug for WordArea {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("WordArea")
            .field("char_index", &self.char_index())
            .field("length", &self.length())
            .finish()
    }
}

pub enum WordAreaError {
    ByteIndexTooBig,
    LengthTooBig,
}

#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};

    quickcheck! {
        fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
            if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
                return TestResult::discard()
            }

            let word_area = WordArea::new_faillible(gen_char_index, gen_length);

            let valid_char_index = word_area.char_index() == gen_char_index;
            let valid_length = word_area.length() == gen_length;

            TestResult::from_bool(valid_char_index && valid_length)
        }

        fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
            if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
                return TestResult::discard()
            }

            let a = WordArea::new_faillible(gen_char_index, gen_length);
            let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);

            TestResult::from_bool(a < b)
        }
    }
}
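For illustration only (not part of the file above): the 22-bit/10-bit packing performed by `WordArea::new`, written out with plain integers and arbitrary example values, to make the shifts and masks easier to follow.

fn packing_example() {
    let char_index: u32 = 1_234; // must fit in 22 bits (< 2^22)
    let length: u16 = 7;         // must fit in 10 bits (< 1024)

    // pack: the char index goes in the high 22 bits, the length in the low 10 bits
    let packed: u32 = (char_index << 10) | u32::from(length);

    // unpack: shift back for the index, mask the low 10 bits for the length
    assert_eq!(packed >> 10, char_index);
    assert_eq!((packed & 0b11_1111_1111) as u16, length);
}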
9
src/write_to_bytes.rs
Normal file
@ -0,0 +1,9 @@
pub trait WriteToBytes {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>);

    fn into_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        self.write_to_bytes(&mut bytes);
        bytes
    }
}
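A minimal usage sketch (the `DocId` type below is hypothetical, not part of the crate): implementing `write_to_bytes` is enough, since `into_bytes` is provided by the trait's default method.

struct DocId(u64);

impl WriteToBytes for DocId {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        // serialize the identifier as little-endian bytes
        bytes.extend_from_slice(&self.0.to_le_bytes());
    }
}

// let bytes = DocId(42).into_bytes(); // uses the provided default method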
59
typos-ranking-rules.md
Normal file
@ -0,0 +1,59 @@
# Typo and Ranking rules

This is an explanation of the default rules used in MeiliDB.

First we have to explain some terms that are used throughout this document.

- A query string is the full list of all the words the end user is searching results for.
- A query word is one of the words that compose the query string.

## Typo rules

The typo rules are applied before sorting the documents. They are used to aggregate documents by selecting the ones that contain words similar to the queried words.

We use a prefix _Levenshtein_ algorithm to check if the words match. The only difference with a classic Levenshtein algorithm is that it also accepts every word that **starts with a query word**. Therefore a document word is accepted either if it starts with the query word or if it has the same length as it.

The Levenshtein distance between two words _M_ and _P_ is called "the minimum cost of transforming _M_ into _P_" by performing the following elementary operations (a short illustrative sketch follows the list):

- substitution of a character of _M_ by a character of _P_. (e.g. **k**itten → **s**itten)
- insertion in _M_ of a character of _P_. (e.g. sittin → sittin**g**)
- deletion of a character of _M_. (e.g. satu**r**day → satuday)

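To make the definition concrete, here is a small stand-alone sketch of the classic Levenshtein distance computed by dynamic programming. It only illustrates the definition above; it is not the engine's implementation.

```rust
fn levenshtein(m: &str, p: &str) -> usize {
    let m: Vec<char> = m.chars().collect();
    let p: Vec<char> = p.chars().collect();

    // row of the dynamic programming table for the previous prefix of `m`
    let mut prev: Vec<usize> = (0..=p.len()).collect();

    for (i, &mc) in m.iter().enumerate() {
        let mut curr = vec![i + 1];
        for (j, &pc) in p.iter().enumerate() {
            let cost = if mc == pc { 0 } else { 1 };
            let best = (prev[j + 1] + 1)   // deletion of a character of M
                .min(curr[j] + 1)          // insertion of a character of P
                .min(prev[j] + cost);      // substitution (or exact match)
            curr.push(best);
        }
        prev = curr;
    }

    prev[p.len()]
}

// levenshtein("satuday", "saturday") == 1  (one insertion)
// levenshtein("satuday", "suturday") == 2  (one substitution + one insertion)
```
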
There are some rules about what can be considered "similar". These rules apply **per word**, not to the whole query string; a small sketch after the list below expresses them as code.

- If the query word is between 1 and 4 characters long, **no** typo is allowed: only documents that contain words that start with, or are exactly equal to, this query word are considered valid for this request.
- If the query word is between 5 and 8 characters long, **one** typo is allowed. Documents that contain words matching with one typo are retained for the next steps.
- If the query word contains more than 8 characters, we accept a maximum of **two** typos.

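Expressed as code, these three rules map the length of a query word to a number of allowed typos. This is only an illustrative sketch, not a function taken from the engine:

```rust
/// Maximum number of typos tolerated for a query word of the given
/// length (in characters), following the three rules above.
fn allowed_typos(query_word_len: usize) -> u8 {
    match query_word_len {
        0..=4 => 0, // 1 to 4 characters: exact words or prefixes only
        5..=8 => 1, // 5 to 8 characters: one typo allowed
        _ => 2,     // more than 8 characters: up to two typos
    }
}

// allowed_typos("satuday".chars().count()) == 1
```
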
This means that "satuday", which is 7 characters long, use the second rule and every document containing words that have only **one** typo will match. For example:
|
||||||
|
|
||||||
|
- "satuday" is accepted because it is exactly the same word.
|
||||||
|
- "sat" is not accepted because the query word is not a prefix of it but the opposite.
|
||||||
|
- "satu**r**day" is accepted because it contains **one** typo.
|
||||||
|
- "s**u**tu**r**day" is not accepted because it contains **two** typos.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Ranking rules

All documents that have been aggregated using the typo rules above can now be sorted. MeiliDB uses a bucket sort.

What is a bucket sort? We sort all the documents with the first rule; for all the documents that can't be separated, we create a group and sort it using the second rule, and so on. A small sketch after the rules list below illustrates the resulting order.

Here is the list of all the default rules, executed in this specific order:

- _Number of Typos_ - The fewer typos there are between the query words and the document words, the better the document.
- _Number of Words_ - A document containing more of the query words will rank higher than one that contains fewer.
- _Words Proximity_ - The closer the query words are to each other in the document, the better the document.
- _Attribute_ - A document containing the query words in a more important attribute than another document is considered better.
- _Position_ - A document containing the query words at the start of an attribute is considered better than a document that contains them at the end.
- _Exact_ - A document containing the query words in their exact form, not only as a prefix of them, is considered better.
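
In effect, this bucket sort orders documents as if the criteria were compared lexicographically, in the order listed above. Here is a rough sketch with hypothetical, precomputed per-document scores (not MeiliDB's real types); for each of the three criteria shown, a lower value is better:

```rust
// Hypothetical, precomputed scores for one document.
struct Scores {
    typos: u32,         // number of typos between query words and document words
    missing_words: u32, // number of query words the document does not contain
    proximity: u32,     // distance between the query words inside the document
}

// Sorting by the first criterion and breaking ties with the next one, in order,
// yields the same ordering as the bucket sort described above.
fn rank(docs: &mut [Scores]) {
    docs.sort_by(|a, b| {
        a.typos
            .cmp(&b.typos)
            .then(a.missing_words.cmp(&b.missing_words))
            .then(a.proximity.cmp(&b.proximity))
    });
}
```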