Mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-07-19 04:50:37 +00:00
Compare commits
111 Commits
SHA1
aef7d7825f
f28ce661af
74eb9c8d0f
d664221c64
58bff3d4ac
2c206eb98c
19724e5af9
c9e0ad132c
24f265a963
f8a743ee00
64971de7ed
a960c325f3
a799470997
10414791a2
743974e60d
0e267cae4b
12a352ae2f
5070b27728
7a6b734078
24823da6f7
8701cb3a8f
315fc1fbe3
23833bac10
8235b6efc9
7f937eea5a
a1cf634ac1
c86472e997
26cb398a6f
f6e664d298
9437cecf87
13309511b3
1941cb16c0
55823c5d5d
4721da1679
482f750231
d5119db165
37578ed74f
f5992ce822
badb0035c5
4bc14aa261
a0c4ec0be0
264fffa826
bddb37e44f
6393b0cbc0
a8df438814
8014857ebf
9e7261a48f
c4e70d0475
cbb0aaa217
ce50e74491
e103e1c277
64929fe5dc
b108f1e6c9
58b417e045
2e5a616d8e
092d446a7e
85a1f126bf
cf58cf86da
db6210c7ee
83cd071827
084c3a95b6
78908aa34e
cf27706f91
d3f53a7fd6
508af5613f
c615c31016
908b28790b
4c0279729b
96dfac5b33
8576218b51
1c1f9201b8
4398b88a3a
73e79f5ca4
1bfd51d6e9
0d2daf27f2
87f0d8cf3c
06d5a10902
94b89c5439
c5e951be09
66ae5c8161
8438e2202f
7a6166d229
d46fa4b215
2bd5b4ab86
5efbc5ceb3
2e905bac08
4c0ad5f964
455cbf3bf4
a3a28c56fa
b0b3175641
c2f0df3f73
820f1f9ac6
337aee5b65
810dfdf656
f016652fca
6c99ebe3fa
94d357985f
fbc698567a
aa9db14c09
61e83a1c21
1316be5b09
4e8b0383dd
4fa10753c1
2473e289e8
e0e5e87ed3
b13e61f40a
c023cb3065
0a3d069fbc
fa062ce2cf
cdc6e47bf5
d5f44838be
```diff
@@ -11,8 +11,8 @@ matrix:
   include:
     # Test crates on their minimum Rust versions.
-    - rust: 1.31.0
-      name: "meilidb on 1.31.0"
+    - rust: 1.32.0
+      name: "meilidb on 1.32.0"
       script: ./ci/meilidb.sh

     # Test crates on nightly Rust.
```
Cargo.toml (58 lines changed)

```diff
@@ -1,23 +1,28 @@
 [package]
 edition = "2018"
 name = "meilidb"
-version = "0.2.0"
+version = "0.3.2"
 authors = ["Kerollmops <renault.cle@gmail.com>"]

 [dependencies]
-bincode = "1.0"
-byteorder = "1.2"
-crossbeam = "0.6"
-fst = "0.3"
-hashbrown = { version = "0.1", features = ["serde"] }
-lazy_static = "1.1"
-levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
-linked-hash-map = { version = "0.5", features = ["serde_impl"] }
-log = "0.4"
-sdset = "0.3"
-serde = "1.0"
-serde_derive = "1.0"
-unidecode = "0.3"
+arc-swap = "0.3.7"
+bincode = "1.1.2"
+byteorder = "1.3.1"
+fst = "0.3.3"
+hashbrown = { version = "0.1.8", features = ["serde"] }
+lazy_static = "1.2.0"
+levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
+linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
+lockfree = "0.5.1"
+log = "0.4.6"
+rayon = "1.0.3"
+sdset = "0.3.1"
+serde = "1.0.88"
+serde_derive = "1.0.88"
+serde_json = { version = "1.0.38", features = ["preserve_order"] }
+size_format = "1.0.2"
+slice-group-by = "0.2.4"
+unidecode = "0.3.0"

 [dependencies.toml]
 git = "https://github.com/Kerollmops/toml-rs.git"
@@ -28,28 +33,23 @@ rev = "0372ba6"
 git = "https://github.com/pingcap/rust-rocksdb.git"
 rev = "306e201"

-[dependencies.group-by]
-git = "https://github.com/Kerollmops/group-by.git"
-rev = "5a113fe"
-
 [features]
 default = ["simd"]
 i128 = ["bincode/i128", "byteorder/i128"]
 portable = ["rocksdb/portable"]
 simd = ["rocksdb/sse"]
-nightly = ["hashbrown/nightly", "group-by/nightly"]
+nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

 [dev-dependencies]
-csv = "1.0"
-elapsed = "0.1"
-env_logger = "0.6"
-jemallocator = "0.1"
-quickcheck = "0.8"
-rand = "0.6"
-rand_xorshift = "0.1"
-structopt = "0.2"
-tempfile = "3.0"
-termcolor = "1.0"
+csv = "1.0.5"
+env_logger = "0.6.0"
+jemallocator = "0.1.9"
+quickcheck = "0.8.2"
+rand = "0.6.5"
+rand_xorshift = "0.1.1"
+structopt = "0.2.14"
+tempfile = "3.0.7"
+termcolor = "1.0.4"

 [profile.release]
 debug = true
```
README.md (22 lines changed)

````diff
@@ -10,7 +10,7 @@ A _full-text search database_ using a key-value store internally.

 It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.

-You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
+You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.

 We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!

@@ -22,20 +22,20 @@ MeiliDB will be a binary in a near future so you will be able to use it as a dat

 ## Performances

-With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
-So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
+With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
+So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.

-Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
+Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.

 ```
 Running 10s test @ http://localhost:2230
-  2 threads and 12 connections
+  2 threads and 25 connections
   Thread Stats   Avg      Stdev     Max   +/- Stdev
-    Latency    18.86ms   49.39ms 614.89ms   95.23%
-    Req/Sec   620.41     59.53   790.00     65.00%
-  12359 requests in 10.00s, 3.26MB read
-Requests/sec:   1235.54
-Transfer/sec:    334.22KB
+    Latency     9.52ms    7.61ms  99.25ms   84.58%
+    Req/Sec     1.41k   119.11     1.78k    64.50%
+  28080 requests in 10.01s, 7.42MB read
+Requests/sec:   2806.46
+Transfer/sec:    759.17KB
 ```

 ### Notes
@@ -49,7 +49,7 @@ MeiliDB runs with an index like most search engines.
 So to test the library you can create one by indexing a simple csv file.

 ```bash
-cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml --stop-words misc/fr.stopwords.txt
+cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
 ```

 Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
````
```diff
@@ -1,17 +1,18 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

-use std::collections::{HashMap, HashSet};
 use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
+use std::time::Instant;
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;

+use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;

-use meilidb::database::{Database, Schema, UpdateBuilder};
+use meilidb::database::{Database, Schema};
 use meilidb::tokenizer::DefaultBuilder;

 #[derive(Debug, StructOpt)]
@@ -50,7 +51,9 @@ fn index(
     stop_words: &HashSet<String>,
 ) -> Result<Database, Box<Error>>
 {
-    let database = Database::create(database_path, &schema)?;
+    let database = Database::create(database_path)?;
+
+    database.create_index("default", &schema)?;

     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
@@ -61,8 +64,7 @@ fn index(

     while !end_of_file {
         let tokenizer_builder = DefaultBuilder::new();
-        let update_path = tempfile::NamedTempFile::new()?;
-        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+        let mut update = database.start_update("default")?;

         loop {
             end_of_file = !rdr.read_record(&mut raw_record)?;
@@ -88,10 +90,8 @@ fn index(

         println!();

-        println!("building update...");
-        let update = update.build()?;
-        println!("ingesting update...");
-        database.ingest_update_file(update)?;
+        println!("committing update...");
+        database.commit_update(update)?;
     }

     Ok(database)
@@ -125,14 +125,13 @@ fn main() -> Result<(), Box<Error>> {
         None => HashSet::new(),
     };

-    let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
-    });
+    let start = Instant::now();
+    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);

     if let Err(e) = result {
         return Err(e.into())
     }

-    println!("database created in {} at: {:?}", elapsed, opt.database_path);
+    println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
     Ok(())
 }
```
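The example now goes through the named-index flow shown above. A minimal sketch of that flow, assuming a hypothetical `update_document` method on `Update` for the document-insertion step (only `create`, `create_index`, `start_update` and `commit_update` appear in this diff):

```rust
use meilidb::database::{Database, Schema};

fn sketch(schema: Schema, _document: impl serde::Serialize) -> Result<(), Box<dyn std::error::Error>> {
    // One on-disk folder can now hold several named indexes.
    let database = Database::create("test.mdb")?;
    database.create_index("default", &schema)?;

    // Updates are staged against a named index, then committed atomically.
    let update = database.start_update("default")?;
    // update.update_document(&_document)?; // hypothetical insertion call
    database.commit_update(update)?;
    Ok(())
}
```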
```diff
@@ -4,6 +4,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 use std::collections::btree_map::{BTreeMap, Entry};
 use std::iter::FromIterator;
 use std::io::{self, Write};
+use std::time::Instant;
 use std::path::PathBuf;
 use std::error::Error;

@@ -27,6 +28,10 @@ pub struct Opt {
     /// The number of returned results
     #[structopt(short = "n", long = "number-results", default_value = "10")]
     pub number_results: usize,
+
+    /// The number of characters before and after the first match
+    #[structopt(short = "C", long = "context", default_value = "35")]
+    pub char_context: usize,
 }

 type Document = HashMap<String, String>;
@@ -66,26 +71,21 @@ fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize)
     (byte_index, byte_length)
 }

-fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
+fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
     let mut byte_indexes = BTreeMap::new();

     for match_ in matches {
-        let match_attribute = match_.attribute.attribute();
-        if SchemaAttr::new(match_attribute) == attribute {
-            let word_area = match_.word_area;
-
-            let char_index = word_area.char_index() as usize;
-            let char_length = word_area.length() as usize;
-            let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
-
-            match byte_indexes.entry(byte_index) {
-                Entry::Vacant(entry) => { entry.insert(byte_length); },
-                Entry::Occupied(mut entry) => {
-                    if *entry.get() < byte_length {
-                        entry.insert(byte_length);
-                    }
-                },
-            }
+        let char_index = match_.char_index as usize;
+        let char_length = match_.char_length as usize;
+        let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
+
+        match byte_indexes.entry(byte_index) {
+            Entry::Vacant(entry) => { entry.insert(byte_length); },
+            Entry::Occupied(mut entry) => {
+                if *entry.get() < byte_length {
+                    entry.insert(byte_length);
+                }
+            },
         }
     }
@@ -100,13 +100,46 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
     title_areas
 }

+/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
+///
+/// ```no_run
+/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+///
+/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
+///
+/// let (text, matches) = crop_text(&text, matches, 35);
+/// ```
+fn crop_text(
+    text: &str,
+    matches: impl IntoIterator<Item=Match>,
+    context: usize,
+) -> (String, Vec<Match>)
+{
+    let mut matches = matches.into_iter().peekable();
+
+    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
+    let start = char_index.saturating_sub(context);
+    let text = text.chars().skip(start).take(context * 2).collect();
+
+    let matches = matches
+        .take_while(|m| {
+            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
+        })
+        .map(|match_| {
+            Match { char_index: match_.char_index - start as u32, ..match_ }
+        })
+        .collect();
+
+    (text, matches)
+}
+
 fn main() -> Result<(), Box<Error>> {
     let _ = env_logger::init();
     let opt = Opt::from_args();

-    let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
-    let database = result?;
-    println!("database prepared for you in {}", elapsed);
+    let start = Instant::now();
+    let database = Database::open(&opt.database_path)?;
+    println!("database prepared for you in {:.2?}", start.elapsed());

     let mut buffer = String::new();
     let input = io::stdin();
@@ -118,16 +151,19 @@ fn main() -> Result<(), Box<Error>> {
         if input.read_line(&mut buffer)? == 0 { break }
         let query = buffer.trim_end_matches('\n');

-        let view = database.view();
+        let view = database.view("default")?;
         let schema = view.schema();

-        let (elapsed, documents) = elapsed::measure_time(|| {
-            let builder = view.query_builder().unwrap();
-            builder.query(query, 0..opt.number_results)
-        });
+        let start = Instant::now();
+
+        let builder = view.query_builder();
+        let documents = builder.query(query, 0..opt.number_results);

         let number_of_documents = documents.len();
-        for doc in documents {
+        for mut doc in documents {
+
+            doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index));

             match view.document_by_id::<Document>(doc.id) {
                 Ok(document) => {
                     for name in &opt.displayed_fields {
@@ -141,7 +177,11 @@ fn main() -> Result<(), Box<Error>> {
                     };

                     print!("{}: ", name);
-                    let areas = create_highlight_areas(&text, &doc.matches, attr);
+                    let matches = doc.matches.iter()
+                        .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                        .cloned();
+                    let (text, matches) = crop_text(&text, matches, opt.char_context);
+                    let areas = create_highlight_areas(&text, &matches);
                     display_highlights(&text, &areas)?;
                     println!();
                 }
@@ -151,7 +191,7 @@ fn main() -> Result<(), Box<Error>> {

             let mut matching_attributes = HashSet::new();
             for _match in doc.matches {
-                let attr = SchemaAttr::new(_match.attribute.attribute());
+                let attr = SchemaAttr::new(_match.attribute);
                 let name = schema.attribute_name(attr);
                 matching_attributes.insert(name);
             }
@@ -162,7 +202,7 @@ fn main() -> Result<(), Box<Error>> {
             println!();
         }

-        eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
+        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
         buffer.clear();
     }
```
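To make the cropping behaviour concrete, here is a self-contained sketch of the same logic, with `Match` reduced to the two fields `crop_text` reads (the real type has more):

```rust
// `Match` here keeps only the two fields the function reads.
struct Match { char_index: u32, char_length: u16 }

fn crop_text(text: &str, matches: impl IntoIterator<Item = Match>, context: usize) -> (String, Vec<Match>) {
    let mut matches = matches.into_iter().peekable();

    // Center the crop window on the first (lowest) match.
    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
    let start = char_index.saturating_sub(context);
    let text: String = text.chars().skip(start).take(context * 2).collect();

    // Keep the matches that fit in the window and re-base their indexes.
    let matches = matches
        .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + context * 2)
        .map(|m| Match { char_index: m.char_index - start as u32, ..m })
        .collect();

    (text, matches)
}

fn main() {
    let text = "the quick brown fox jumps over the lazy dog";
    let matches = vec![Match { char_index: 16, char_length: 3 }]; // "fox"
    let (cropped, shifted) = crop_text(text, matches, 5);
    assert_eq!(cropped, "rown fox j");    // five chars of context on each side
    assert_eq!(shifted[0].char_index, 5); // match re-based to the cropped text
}
```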
src/attribute.rs (deleted, 105 lines)

```diff
@@ -1,105 +0,0 @@
-use std::fmt;
-
-/// Represent an attribute number along with the word index
-/// according to the tokenizer used.
-///
-/// It can accept up to 1024 attributes and word positions
-/// can be maximum 2^22.
-#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Attribute(u32);
-
-impl Attribute {
-    /// Construct an `Attribute` from an attribute number and
-    /// the word position of a match according to the tokenizer used.
-    pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
-        if attribute & 0b1111_1100_0000_0000 != 0 {
-            return Err(AttributeError::AttributeTooBig)
-        }
-
-        if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
-            return Err(AttributeError::IndexTooBig)
-        }
-
-        let attribute = u32::from(attribute) << 22;
-        Ok(Attribute(attribute | index))
-    }
-
-    /// Construct an `Attribute` from an attribute number and
-    /// the word position of a match according to the tokenizer used.
-    ///
-    /// # Panics
-    ///
-    /// The attribute must not be greater than 1024
-    /// and the word index not greater than 2^22.
-    pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
-        match Attribute::new(attribute, index) {
-            Ok(attribute) => attribute,
-            Err(AttributeError::AttributeTooBig) => {
-                panic!("attribute must not be greater than 1024")
-            },
-            Err(AttributeError::IndexTooBig) => {
-                panic!("attribute word index must not be greater than 2^22")
-            },
-        }
-    }
-
-    pub(crate) fn max_value() -> Attribute {
-        Attribute(u32::max_value())
-    }
-
-    #[inline]
-    pub fn attribute(self) -> u16 {
-        (self.0 >> 22) as u16
-    }
-
-    #[inline]
-    pub fn word_index(self) -> u32 {
-        self.0 & 0b0000_0000_0011_1111_1111_1111_1111
-    }
-}
-
-impl fmt::Debug for Attribute {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Attribute")
-            .field("attribute", &self.attribute())
-            .field("word_index", &self.word_index())
-            .finish()
-    }
-}
-
-pub enum AttributeError {
-    AttributeTooBig,
-    IndexTooBig,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use quickcheck::{quickcheck, TestResult};
-
-    quickcheck! {
-        fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
-            if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) {
-                return TestResult::discard()
-            }
-
-            let attribute = Attribute::new_faillible(gen_attr, gen_index);
-
-            let valid_attribute = attribute.attribute() == gen_attr;
-            let valid_index = attribute.word_index() == gen_index;
-
-            TestResult::from_bool(valid_attribute && valid_index)
-        }
-
-        fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
-            if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
-                return TestResult::discard()
-            }
-
-            let a = Attribute::new_faillible(gen_attr, gen_index);
-            let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
-
-            TestResult::from_bool(a < b)
-        }
-    }
-}
```
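For reference, the removed type packed both values into a single `u32`: the attribute number in the top 10 bits and the word index in the low 22 bits. A standalone sketch of that split, following the doc-comment above:

```rust
// Sketch of the packing the removed `Attribute` type performed.
fn pack(attribute: u16, word_index: u32) -> u32 {
    assert!(u32::from(attribute) < 1 << 10); // at most 1024 attributes
    assert!(word_index < 1 << 22);           // word positions below 2^22
    (u32::from(attribute) << 22) | word_index
}

fn main() {
    let packed = pack(3, 11);
    assert_eq!(packed >> 22, 3);              // attribute recovered
    assert_eq!(packed & ((1 << 22) - 1), 11); // word index recovered
}
```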
```diff
@@ -1,12 +1,15 @@
-use std::io::{self, Cursor, BufRead};
 use std::slice::from_raw_parts;
 use std::mem::size_of;
+use std::error::Error;

 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use sdset::Set;

-use crate::DocumentId;
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::SharedData;
+use crate::DocumentId;

 use super::into_u8_slice;

 #[derive(Default, Clone)]
@@ -19,21 +22,6 @@ impl DocIds {
         DocIds(data)
     }

-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let doc_ids = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        Ok(DocIds(doc_ids))
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let len = self.0.len() as u64;
-        bytes.write_u64::<LittleEndian>(len).unwrap();
-        bytes.extend_from_slice(&self.0);
-    }
-
     pub fn is_empty(&self) -> bool {
         self.0.is_empty()
     }
@@ -52,3 +40,22 @@ impl AsRef<Set<DocumentId>> for DocIds {
         Set::new_unchecked(slice)
     }
 }
+
+impl FromSharedDataCursor for DocIds {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let data = cursor.extract(len);
+
+        Ok(DocIds(data))
+    }
+}
+
+impl WriteToBytes for DocIds {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let len = self.0.len() as u64;
+        bytes.write_u64::<LittleEndian>(len).unwrap();
+        bytes.extend_from_slice(&self.0);
+    }
+}
```
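The `FromSharedDataCursor`/`WriteToBytes` pair keeps the same framing as the removed methods: a little-endian `u64` byte count followed by the raw payload. A standalone round-trip sketch of that framing, using `byteorder` directly rather than the crate's cursor types:

```rust
use std::io::Cursor;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

fn main() -> std::io::Result<()> {
    let payload = b"doc-ids-bytes";

    // Write: u64 length prefix, then the raw bytes.
    let mut bytes = Vec::new();
    bytes.write_u64::<LittleEndian>(payload.len() as u64)?;
    bytes.extend_from_slice(payload);

    // Read: length prefix first, then slice out exactly that many bytes.
    let mut cursor = Cursor::new(&bytes);
    let len = cursor.read_u64::<LittleEndian>()? as usize;
    let offset = cursor.position() as usize;
    assert_eq!(&bytes[offset..offset + len], payload);
    Ok(())
}
```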
```diff
@@ -1,14 +1,16 @@
-use std::io::{self, Write, Cursor, BufRead};
+use std::io::{self, Write};
 use std::slice::from_raw_parts;
 use std::mem::size_of;
 use std::ops::Index;
-use std::sync::Arc;

 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use sdset::Set;

-use crate::DocIndex;
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::SharedData;
+use crate::DocIndex;

 use super::into_u8_slice;

 #[derive(Debug)]
@@ -25,38 +27,6 @@ pub struct DocIndexes {
 }

 impl DocIndexes {
-    pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
-        let bytes = Arc::new(bytes);
-        let len = bytes.len();
-        let data = SharedData::new(bytes, 0, len);
-        let mut cursor = Cursor::new(data);
-        DocIndexes::from_cursor(&mut cursor)
-    }
-
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let ranges = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let indexes = cursor.get_ref().range(offset, len);
-        cursor.consume(len);
-
-        Ok(DocIndexes { ranges, indexes })
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let ranges_len = self.ranges.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(ranges_len);
-        bytes.extend_from_slice(&self.ranges);
-
-        let indexes_len = self.indexes.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
-        bytes.extend_from_slice(&self.indexes);
-    }
-
     pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
         self.ranges().get(index).map(|Range { start, end }| {
             let start = *start as usize;
@@ -92,6 +62,32 @@ impl Index<usize> for DocIndexes {
     }
 }

+impl FromSharedDataCursor for DocIndexes {
+    type Error = io::Error;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let ranges = cursor.extract(len);
+
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let indexes = cursor.extract(len);
+
+        Ok(DocIndexes { ranges, indexes })
+    }
+}
+
+impl WriteToBytes for DocIndexes {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let ranges_len = self.ranges.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(ranges_len);
+        bytes.extend_from_slice(&self.ranges);
+
+        let indexes_len = self.indexes.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
+        bytes.extend_from_slice(&self.indexes);
+    }
+}
+
 pub struct DocIndexesBuilder<W> {
     ranges: Vec<Range>,
     indexes: Vec<DocIndex>,
@@ -147,29 +143,32 @@ impl<W: Write> DocIndexesBuilder<W> {

 #[cfg(test)]
 mod tests {
-    use super::*;
-
     use std::error::Error;
-    use crate::{Attribute, WordArea};

     use crate::DocumentId;
+    use super::*;

     #[test]
     fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };

         let mut builder = DocIndexesBuilder::memory();
@@ -193,18 +192,24 @@ mod tests {
     fn serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };

         let mut builder = DocIndexesBuilder::memory();
```
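The layout behind `DocIndexes` is unchanged by this diff: one flat buffer of entries plus a `(start, end)` range per key, which `get` slices on demand. A simplified sketch of that two-level scheme:

```rust
// Simplified stand-ins for the Range/DocIndexes pair shown above.
struct Range { start: u64, end: u64 }

struct FlatSets<T> {
    ranges: Vec<Range>, // one (start, end) per key
    indexes: Vec<T>,    // all entries, back to back
}

impl<T> FlatSets<T> {
    fn get(&self, i: usize) -> Option<&[T]> {
        self.ranges.get(i).map(|Range { start, end }| {
            &self.indexes[*start as usize..*end as usize]
        })
    }
}

fn main() {
    let sets = FlatSets {
        ranges: vec![Range { start: 0, end: 2 }, Range { start: 2, end: 3 }],
        indexes: vec![10, 11, 20],
    };
    assert_eq!(sets.get(0), Some(&[10, 11][..]));
    assert_eq!(sets.get(1), Some(&[20][..]));
}
```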
```diff
@@ -1,55 +1,13 @@
 mod doc_ids;
 mod doc_indexes;
+mod shared_data;

 use std::slice::from_raw_parts;
 use std::mem::size_of;
-use std::ops::Deref;
-use std::sync::Arc;

 pub use self::doc_ids::DocIds;
 pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
-
-#[derive(Default, Clone)]
-pub struct SharedData {
-    pub bytes: Arc<Vec<u8>>,
-    pub offset: usize,
-    pub len: usize,
-}
-
-impl SharedData {
-    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
-        let len = vec.len();
-        let bytes = Arc::new(vec);
-        SharedData::new(bytes, 0, len)
-    }
-
-    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
-        SharedData { bytes, offset, len }
-    }
-
-    pub fn range(&self, offset: usize, len: usize) -> SharedData {
-        assert!(offset + len <= self.len);
-        SharedData {
-            bytes: self.bytes.clone(),
-            offset: self.offset + offset,
-            len: len,
-        }
-    }
-}
-
-impl Deref for SharedData {
-    type Target = [u8];
-
-    fn deref(&self) -> &Self::Target {
-        self.as_ref()
-    }
-}
-
-impl AsRef<[u8]> for SharedData {
-    fn as_ref(&self) -> &[u8] {
-        &self.bytes[self.offset..self.offset + self.len]
-    }
-}
+pub use self::shared_data::SharedData;

 unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
     let ptr = slice.as_ptr() as *const u8;
```
src/data/shared_data.rs (new file, 48 lines)

```diff
@@ -0,0 +1,48 @@
+use std::sync::Arc;
+use std::ops::Deref;
+
+#[derive(Default, Clone)]
+pub struct SharedData {
+    pub bytes: Arc<Vec<u8>>,
+    pub offset: usize,
+    pub len: usize,
+}
+
+impl SharedData {
+    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
+        let len = vec.len();
+        let bytes = Arc::from(vec);
+        SharedData::new(bytes, 0, len)
+    }
+
+    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+        SharedData { bytes, offset, len }
+    }
+
+    pub fn as_slice(&self) -> &[u8] {
+        &self.bytes[self.offset..self.offset + self.len]
+    }
+
+    pub fn range(&self, offset: usize, len: usize) -> SharedData {
+        assert!(offset + len <= self.len);
+        SharedData {
+            bytes: self.bytes.clone(),
+            offset: self.offset + offset,
+            len: len,
+        }
+    }
+}
+
+impl Deref for SharedData {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl AsRef<[u8]> for SharedData {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
```
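Given the `SharedData` type above, a short usage sketch showing that `range` yields zero-copy views over the same allocation:

```rust
use std::sync::Arc;

fn main() {
    let data = SharedData::from_bytes(b"hello world".to_vec());

    // Two sub-views into the same buffer; only offsets and lengths differ.
    let hello = data.range(0, 5);
    let world = data.range(6, 5);

    assert_eq!(hello.as_slice(), b"hello");
    assert_eq!(world.as_slice(), b"world");
    // Both views borrow the same backing allocation.
    assert!(Arc::ptr_eq(&hello.bytes, &world.bytes));
}
```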
src/database/config.rs (new file, 46 lines)

```diff
@@ -0,0 +1,46 @@
+use std::collections::{HashSet, HashMap};
+use serde_derive::{Serialize, Deserialize};
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum RankingOrdering {
+    Asc,
+    Dsc
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct AccessToken {
+    pub read_key: String,
+    pub write_key: String,
+    pub admin_key: String,
+}
+
+#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Config {
+    pub stop_words: Option<HashSet<String>>,
+    pub ranking_order: Option<Vec<String>>,
+    pub distinct_field: Option<String>,
+    pub ranking_rules: Option<HashMap<String, RankingOrdering>>,
+    pub access_token: Option<AccessToken>,
+}
+
+impl Config {
+    pub fn update_with(&mut self, new: Config) {
+        if let Some(stop_words) = new.stop_words {
+            self.stop_words = Some(stop_words);
+        };
+        if let Some(ranking_order) = new.ranking_order {
+            self.ranking_order = Some(ranking_order);
+        };
+        if let Some(distinct_field) = new.distinct_field {
+            self.distinct_field = Some(distinct_field);
+        };
+        if let Some(ranking_rules) = new.ranking_rules {
+            self.ranking_rules = Some(ranking_rules);
+        };
+        if let Some(access_token) = new.access_token {
+            self.access_token = Some(access_token);
+        };
+    }
+}
```
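A short sketch of the partial-update semantics of `update_with`, given the `Config` type above: only the `Some` fields of the incoming value overwrite the existing ones:

```rust
fn main() {
    let mut config = Config {
        stop_words: Some(vec!["the".to_string()].into_iter().collect()),
        ..Config::default()
    };

    // Only distinct_field is set here; every other field stays None.
    config.update_with(Config {
        distinct_field: Some("id".to_string()),
        ..Config::default()
    });

    assert!(config.stop_words.is_some());                     // left untouched
    assert_eq!(config.distinct_field.as_deref(), Some("id")); // overwritten
}
```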
```diff
@@ -38,6 +38,10 @@ impl DocumentKey {
         DocumentKeyAttr::new(self.document_id(), attr)
     }

+    pub fn with_attribute_min(&self) -> DocumentKeyAttr {
+        DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
+    }
+
     pub fn with_attribute_max(&self) -> DocumentKeyAttr {
         DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
     }
```
```diff
@@ -1,60 +1,45 @@
-use std::io::{Write, BufRead, Cursor};
 use std::error::Error;

 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use fst::{map, Map, Streamer, IntoStreamer};
-use sdset::{Set, SetOperation};
-use sdset::duo::Union;
+use fst::{map, Map, IntoStreamer, Streamer};
 use fst::raw::Fst;
+use sdset::duo::{Union, DifferenceByKey};
+use sdset::{Set, SetOperation};

+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
 use crate::data::{DocIndexes, DocIndexesBuilder};
 use crate::data::SharedData;
-use crate::DocIndex;
+use crate::{DocumentId, DocIndex};

 #[derive(Default)]
-pub struct Positive {
-    map: Map,
-    indexes: DocIndexes,
+pub struct Index {
+    pub map: Map,
+    pub indexes: DocIndexes,
 }

-impl Positive {
-    pub fn new(map: Map, indexes: DocIndexes) -> Positive {
-        Positive { map, indexes }
+impl Index {
+    pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
+        let mut buffer = Vec::new();
+        let mut builder = IndexBuilder::new();
+        let mut stream = self.into_stream();
+
+        while let Some((key, indexes)) = stream.next() {
+            buffer.clear();
+
+            let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
+            op.extend_vec(&mut buffer);
+
+            if !buffer.is_empty() {
+                let indexes = Set::new_unchecked(&buffer);
+                builder.insert(key, indexes).unwrap();
+            }
+        }
+
+        builder.build()
     }

-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
-        let len = cursor.read_u64::<LittleEndian>()? as usize;
-        let offset = cursor.position() as usize;
-        let data = cursor.get_ref().range(offset, len);
-
-        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
-        let map = Map::from(fst);
-        cursor.consume(len);
-
-        let indexes = DocIndexes::from_cursor(cursor)?;
-
-        Ok(Positive { map, indexes})
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.map.as_fst().as_bytes();
-        let len = slice.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(len);
-        bytes.extend_from_slice(slice);
-
-        self.indexes.write_to_bytes(bytes);
-    }
-
-    pub fn map(&self) -> &Map {
-        &self.map
-    }
-
-    pub fn indexes(&self) -> &DocIndexes {
-        &self.indexes
-    }
-
-    pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
-        let mut builder = PositiveBuilder::memory();
+    pub fn union(&self, other: &Index) -> Index {
+        let mut builder = IndexBuilder::new();
         let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();

         let mut buffer = Vec::new();
@@ -63,19 +48,19 @@ impl Positive {
             match ivalues {
                 [a, b] => {
                     let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
+                    let indexes = &indexes[a.value as usize];
                     let a = Set::new_unchecked(indexes);

                     let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
+                    let indexes = &indexes[b.value as usize];
                     let b = Set::new_unchecked(indexes);

                     let op = Union::new(a, b);
                     op.extend_vec(&mut buffer);
                 },
-                [a] => {
-                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
-                    let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
+                [x] => {
+                    let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
+                    let indexes = &indexes[x.value as usize];
                     buffer.extend_from_slice(indexes)
                 },
                 _ => continue,
@@ -83,23 +68,45 @@ impl Positive {

             if !buffer.is_empty() {
                 let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes)?;
+                builder.insert(key, indexes).unwrap();
             }
         }

-        let (map, indexes) = builder.into_inner()?;
-        let map = Map::from_bytes(map)?;
-        let indexes = DocIndexes::from_bytes(indexes)?;
-        Ok(Positive { map, indexes })
+        builder.build()
     }
 }

-impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
+impl FromSharedDataCursor for Index {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let data = cursor.extract(len);
+
+        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
+        let map = Map::from(fst);
+
+        let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
+
+        Ok(Index { map, indexes})
+    }
+}
+
+impl WriteToBytes for Index {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let slice = self.map.as_fst().as_bytes();
+        let len = slice.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(len);
+        bytes.extend_from_slice(slice);
+
+        self.indexes.write_to_bytes(bytes);
+    }
+}
+
+impl<'m, 'a> IntoStreamer<'a> for &'m Index {
     type Item = (&'a [u8], &'a Set<DocIndex>);
     /// The type of the stream to be constructed.
     type Into = Stream<'m>;

     /// Construct a stream from `Self`.
     fn into_stream(self) -> Self::Into {
         Stream {
             map_stream: self.map.into_stream(),
@@ -128,28 +135,26 @@ impl<'m, 'a> Streamer<'a> for Stream<'m> {
     }
 }

-pub struct PositiveBuilder<W, X> {
-    map: fst::MapBuilder<W>,
-    indexes: DocIndexesBuilder<X>,
+pub struct IndexBuilder {
+    map: fst::MapBuilder<Vec<u8>>,
+    indexes: DocIndexesBuilder<Vec<u8>>,
     value: u64,
 }

-impl PositiveBuilder<Vec<u8>, Vec<u8>> {
-    pub fn memory() -> Self {
-        PositiveBuilder {
+impl IndexBuilder {
+    pub fn new() -> Self {
+        IndexBuilder {
             map: fst::MapBuilder::memory(),
             indexes: DocIndexesBuilder::memory(),
             value: 0,
         }
     }
-}

-impl<W: Write, X: Write> PositiveBuilder<W, X> {
     /// If a key is inserted that is less than or equal to any previous key added,
     /// then an error is returned. Similarly, if there was a problem writing
     /// to the underlying writer, an error is returned.
     // FIXME what if one write doesn't work but the other do ?
-    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
+    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
     where K: AsRef<[u8]>,
     {
         self.map.insert(key, self.value)?;
@@ -158,9 +163,13 @@ impl<W: Write, X: Write> PositiveBuilder<W, X> {
         Ok(())
     }

-    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
-        let map = self.map.into_inner()?;
-        let indexes = self.indexes.into_inner()?;
-        Ok((map, indexes))
+    pub fn build(self) -> Index {
+        let map = self.map.into_inner().unwrap();
+        let indexes = self.indexes.into_inner().unwrap();
+
+        let map = Map::from_bytes(map).unwrap();
+        let indexes = DocIndexes::from_bytes(indexes).unwrap();
+
+        Index { map, indexes }
     }
 }
```
```diff
@@ -1,82 +0,0 @@
-mod negative;
-mod positive;
-
-pub(crate) use self::negative::Negative;
-pub(crate) use self::positive::{Positive, PositiveBuilder};
-
-use std::error::Error;
-use std::io::Cursor;
-use std::sync::Arc;
-
-use fst::{IntoStreamer, Streamer};
-use sdset::duo::DifferenceByKey;
-use sdset::{Set, SetOperation};
-use fst::Map;
-
-use crate::data::{SharedData, DocIndexes};
-
-#[derive(Default)]
-pub struct Index {
-    pub(crate) negative: Negative,
-    pub(crate) positive: Positive,
-}
-
-impl Index {
-    pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
-        let len = bytes.len();
-        Index::from_shared_bytes(Arc::new(bytes), 0, len)
-    }
-
-    pub fn from_shared_bytes(
-        bytes: Arc<Vec<u8>>,
-        offset: usize,
-        len: usize,
-    ) -> Result<Index, Box<Error>>
-    {
-        let data = SharedData::new(bytes, offset, len);
-        let mut cursor = Cursor::new(data);
-
-        let negative = Negative::from_cursor(&mut cursor)?;
-        let positive = Positive::from_cursor(&mut cursor)?;
-        Ok(Index { negative, positive })
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        self.negative.write_to_bytes(bytes);
-        self.positive.write_to_bytes(bytes);
-    }
-
-    pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
-        if other.negative.is_empty() {
-            let negative = Negative::default();
-            let positive = self.positive.union(&other.positive)?;
-            return Ok(Index { negative, positive })
-        }
-
-        let mut buffer = Vec::new();
-        let mut builder = PositiveBuilder::memory();
-        let mut stream = self.positive.into_stream();
-        while let Some((key, indexes)) = stream.next() {
-            let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
-
-            buffer.clear();
-            op.extend_vec(&mut buffer);
-
-            if !buffer.is_empty() {
-                let indexes = Set::new_unchecked(&buffer);
-                builder.insert(key, indexes)?;
-            }
-        }
-
-        let positive = {
-            let (map, indexes) = builder.into_inner()?;
-            let map = Map::from_bytes(map)?;
-            let indexes = DocIndexes::from_bytes(indexes)?;
-            Positive::new(map, indexes)
-        };
-
-        let negative = Negative::default();
-        let positive = positive.union(&other.positive)?;
-        Ok(Index { negative, positive })
-    }
-}
```
```diff
@@ -1,43 +0,0 @@
-use std::error::Error;
-use std::io::Cursor;
-use std::ops::Deref;
-
-use sdset::Set;
-use byteorder::{LittleEndian, WriteBytesExt};
-
-use crate::data::SharedData;
-use crate::data::DocIds;
-use crate::DocumentId;
-
-#[derive(Default)]
-pub struct Negative(DocIds);
-
-impl Negative {
-    pub fn new(doc_ids: DocIds) -> Negative {
-        Negative(doc_ids)
-    }
-
-    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
-        let doc_ids = DocIds::from_cursor(cursor)?;
-        Ok(Negative(doc_ids))
-    }
-
-    pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.0.as_bytes();
-        let len = slice.len() as u64;
-        let _ = bytes.write_u64::<LittleEndian>(len);
-        bytes.extend_from_slice(slice);
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-}
-
-impl Deref for Negative {
-    type Target = Set<DocumentId>;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.as_ref()
-    }
-}
```
```diff
@@ -1,27 +1,48 @@
-use std::sync::{Arc, Mutex};
+use std::time::Instant;
 use std::error::Error;
-use std::ops::Deref;
-use std::path::Path;
+use std::ffi::OsStr;
+use std::sync::Arc;
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::ops::{Deref, DerefMut};

-use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
+use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
 use rocksdb::rocksdb::{Writable, Snapshot};
-use rocksdb::{DB, DBVector, MergeOperands};
-use crossbeam::atomic::ArcCell;
-use log::debug;
+use rocksdb::{DB, MergeOperands};
+use size_format::SizeFormatterBinary;
+use arc_swap::ArcSwap;
+use lockfree::map::Map;
+use hashbrown::HashMap;
+use log::{info, error, warn};

+use crate::database::schema::SchemaAttr;
+use crate::shared_data_cursor::FromSharedDataCursor;
+use crate::write_to_bytes::WriteToBytes;
+use crate::DocumentId;
+
+use self::update::{ReadIndexEvent, ReadRankedMapEvent};
+
+pub use self::config::Config;
 pub use self::document_key::{DocumentKey, DocumentKeyAttr};
 pub use self::view::{DatabaseView, DocumentIter};
-pub use self::update::{Update, UpdateBuilder};
+pub use self::update::Update;
 pub use self::serde::SerializerError;
 pub use self::schema::Schema;
+pub use self::index::Index;
+pub use self::number::{Number, ParseNumberError};

-const DATA_INDEX: &[u8] = b"data-index";
-const DATA_SCHEMA: &[u8] = b"data-schema";
+pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
+
+const DATA_INDEX: &[u8] = b"data-index";
+const DATA_RANKED_MAP: &[u8] = b"data-ranked-map";
+const DATA_SCHEMA: &[u8] = b"data-schema";
+const CONFIG: &[u8] = b"config";

+pub mod config;
 pub mod schema;
+pub(crate) mod index;
 mod deserializer;
+mod number;
 mod document_key;
 mod serde;
 mod update;
```
```diff
@@ -39,64 +60,150 @@ where D: Deref<Target=DB>
 fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
 where D: Deref<Target=DB>
 {
-    let index = match snapshot.get(DATA_INDEX)? {
-        Some(vector) => {
-            let bytes = vector.as_ref().to_vec();
-            Index::from_bytes(bytes)?
-        },
-        None => Index::default(),
-    };
+    let start = Instant::now();
+    let vector = snapshot.get(DATA_INDEX)?;
+    info!("loading index from kv-store took {:.2?}", start.elapsed());

-    Ok(index)
+    match vector {
+        Some(vector) => {
+            let start = Instant::now();
+
+            let bytes = vector.as_ref().to_vec();
+            info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
+
+            let event = ReadIndexEvent::from_bytes(bytes)?;
+            let index = event.updated_documents().expect("BUG: invalid event deserialized");
+
+            info!("loading index from bytes took {:.2?}", start.elapsed());
+
+            Ok(index)
+        },
+        None => Ok(Index::default()),
+    }
 }

+fn retrieve_data_ranked_map<D>(snapshot: &Snapshot<D>) -> Result<RankedMap, Box<Error>>
+where D: Deref<Target=DB>,
+{
+    let start = Instant::now();
+    let vector = snapshot.get(DATA_RANKED_MAP)?;
+    info!("loading ranked map from kv-store took {:.2?}", start.elapsed());
+
+    match vector {
+        Some(vector) => {
+            let start = Instant::now();
+
+            let bytes = vector.as_ref().to_vec();
+            info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
+
+            let event = ReadRankedMapEvent::from_bytes(bytes)?;
+            let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized");
+
+            info!("loading ranked map from bytes took {:.2?}", start.elapsed());
+
+            Ok(ranked_map)
+        },
+        None => Ok(RankedMap::new()),
+    }
+}
+
+fn retrieve_config<D>(snapshot: &Snapshot<D>) -> Result<Config, Box<Error>>
+where D: Deref<Target=DB>,
+{
+    match snapshot.get(CONFIG)? {
+        Some(vector) => Ok(bincode::deserialize(&*vector)?),
+        None => Ok(Config::default()),
+    }
+}
+
-fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
-    assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
+fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
+    use self::update::ReadIndexEvent::{self, *};
+    use self::update::WriteIndexEvent;

-    let mut index: Option<Index> = None;
+    let mut index = Index::default();
     for bytes in existing.into_iter().chain(operands) {
-        let operand = Index::from_bytes(bytes.to_vec()).unwrap();
-        let merged = match index {
-            Some(ref index) => index.merge(&operand).unwrap(),
-            None => operand,
-        };
-
-        index.replace(merged);
+        match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() {
+            RemovedDocuments(d) => index = index.remove_documents(d.as_ref()),
+            UpdatedDocuments(i) => index = index.union(&i),
+        }
     }

-    let index = index.unwrap_or_default();
-    let mut bytes = Vec::new();
-    index.write_to_bytes(&mut bytes);
-    bytes
+    WriteIndexEvent::UpdatedDocuments(&index).into_bytes()
 }

-pub struct Database {
-    // DB is under a Mutex to sync update ingestions and separate DB update locking
-    // and DatabaseView acquiring locking in other words:
-    // "Block readers the minimum possible amount of time"
-    db: Mutex<Arc<DB>>,
+fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
+    use self::update::ReadRankedMapEvent::{self, *};
+    use self::update::WriteRankedMapEvent;

-    // This view is updated each time the DB ingests an update
-    view: ArcCell<DatabaseView<Arc<DB>>>,
+    let mut ranked_map = RankedMap::default();
+    for bytes in existing.into_iter().chain(operands) {
+        match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() {
+            RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()),
+            UpdatedDocuments(i) => ranked_map.extend(i),
+        }
+    }
+
+    WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes()
 }

-impl Database {
-    pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
+fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
+    match key {
+        DATA_INDEX => merge_indexes(existing, operands),
+        DATA_RANKED_MAP => merge_ranked_maps(existing, operands),
+        key => panic!("The merge operator does not support merging {:?}", key),
+    }
+}
```
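RocksDB hands the merge operator the existing value plus every pending operand, and `merge_indexes`/`merge_ranked_maps` fold them left to right. A pure-Rust sketch of that fold, with the serialized events replaced by an in-memory enum whose shape is assumed from the `ReadIndexEvent` variants used above:

```rust
// Assumed event shape: one variant removes ids, the other adds entries.
enum IndexEvent {
    RemovedDocuments(Vec<u64>), // sorted document ids
    UpdatedDocuments(Vec<u64>), // stand-in for a full Index
}

fn merge(existing: Option<Vec<u64>>, operands: Vec<IndexEvent>) -> Vec<u64> {
    let mut index = existing.unwrap_or_default();
    for event in operands {
        match event {
            IndexEvent::RemovedDocuments(d) => index.retain(|id| d.binary_search(id).is_err()),
            IndexEvent::UpdatedDocuments(i) => {
                index.extend(i);
                index.sort_unstable();
                index.dedup();
            }
        }
    }
    index
}

fn main() {
    let merged = merge(
        Some(vec![1, 2, 3]),
        vec![
            IndexEvent::RemovedDocuments(vec![2]),
            IndexEvent::UpdatedDocuments(vec![4]),
        ],
    );
    assert_eq!(merged, vec![1, 3, 4]);
}
```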
```diff
+pub struct IndexUpdate {
+    index: String,
+    update: Update,
+}
+
+impl Deref for IndexUpdate {
+    type Target = Update;
+
+    fn deref(&self) -> &Update {
+        &self.update
+    }
+}
+
+impl DerefMut for IndexUpdate {
+    fn deref_mut(&mut self) -> &mut Update {
+        &mut self.update
+    }
+}
+
+struct DatabaseIndex {
+    db: Arc<DB>,
+
+    // This view is updated each time the DB ingests an update.
+    view: ArcSwap<DatabaseView<Arc<DB>>>,
+
+    // The path of the mdb folder stored on disk.
+    path: PathBuf,
+
+    // must_die false by default, must be set as true when the Index is dropped.
+    // It is used to erase the folder saved on disk when the user request to delete an index.
+    must_die: AtomicBool,
+}
+
+impl DatabaseIndex {
+    fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
         let path = path.as_ref();
         if path.exists() {
             return Err(format!("File already exists at path: {}, cannot create database.",
                                path.display()).into())
         }

-        let path = path.to_string_lossy();
+        let path_lossy = path.to_string_lossy();
         let mut opts = DBOptions::new();
         opts.create_if_missing(true);
         // opts.error_if_exists(true); // FIXME pull request that

         let mut cf_opts = ColumnFamilyOptions::new();
-        cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
+        cf_opts.add_merge_operator("data merge operator", merge_operator);

-        let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
+        let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;

         let mut schema_bytes = Vec::new();
         schema.write_to_bin(&mut schema_bytes)?;
@@ -104,21 +211,26 @@ impl Database {

         let db = Arc::new(db);
         let snapshot = Snapshot::new(db.clone());
-        let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
+        let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));

-        Ok(Database { db: Mutex::new(db), view })
+        Ok(DatabaseIndex {
+            db: db,
+            view: view,
+            path: path.to_path_buf(),
+            must_die: AtomicBool::new(false)
+        })
     }

-    pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
-        let path = path.as_ref().to_string_lossy();
+    fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
+        let path_lossy = path.as_ref().to_string_lossy();

         let mut opts = DBOptions::new();
         opts.create_if_missing(false);

         let mut cf_opts = ColumnFamilyOptions::new();
-        cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
+        cf_opts.add_merge_operator("data merge operator", merge_operator);

-        let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
+        let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;

         // FIXME create a generic function to do that !
         let _schema = match db.get(DATA_SCHEMA)? {
@@ -128,79 +240,209 @@ impl Database {

         let db = Arc::new(db);
         let snapshot = Snapshot::new(db.clone());
-        let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
+        let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));

-        Ok(Database { db: Mutex::new(db), view })
+        Ok(DatabaseIndex {
+            db: db,
+            view: view,
+            path: path.as_ref().to_path_buf(),
+            must_die: AtomicBool::new(false)
+        })
     }

-    pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
-        let snapshot = {
-            // We must have a mutex here to ensure that update ingestions and compactions
-            // are done atomatically and in the right order.
-            // This way update ingestions will block other update ingestions without blocking view
-            // creations while doing the "data-index" compaction
-            let db = match self.db.lock() {
-                Ok(db) => db,
-                Err(e) => return Err(e.to_string().into()),
-            };
+    fn must_die(&self) {
+        self.must_die.store(true, Ordering::Relaxed)
+    }

-            let path = update.path().to_string_lossy();
-            let options = IngestExternalFileOptions::new();
-            // options.move_files(move_update);
-
-            debug!("ingest update file");
-            let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
-            db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
-
-            debug!("compacting index range");
-            // Compacting to trigger the merge operator only one time
-            // while ingesting the update and not each time searching
-            db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
-
-            Snapshot::new(db.clone())
+    fn start_update(&self) -> Result<Update, Box<Error>> {
+        let schema = match self.db.get(DATA_SCHEMA)? {
+            Some(value) => Schema::read_from_bin(&*value)?,
+            None => panic!("Database does not contain a schema"),
         };

+        Ok(Update::new(schema))
+    }
+
+    fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
+        let batch = update.build()?;
+        self.db.write(batch)?;
+
+        let snapshot = Snapshot::new(self.db.clone());
         let view = Arc::new(DatabaseView::new(snapshot)?);
-        self.view.set(view.clone());
+        self.view.store(view.clone());

         Ok(view)
     }

-    pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
-        self.view().get(key)
+    fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
+        self.view.load()
     }

-    pub fn flush(&self) -> Result<(), Box<Error>> {
-        match self.db.lock() {
-            Ok(db) => Ok(db.flush(true)?),
-            Err(e) => Err(e.to_string().into()),
+    fn get_config(&self) -> Config {
+        self.view().config().clone()
+    }
+
+    fn update_config(&self, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
+        let data = bincode::serialize(&config)?;
+        self.db.put(CONFIG, &data)?;
+
+        let snapshot = Snapshot::new(self.db.clone());
+        let view = Arc::new(DatabaseView::new(snapshot)?);
+        self.view.store(view.clone());
+
+        Ok(view)
+    }
+
+    fn path(&self) -> &Path {
+        self.path.as_path()
+    }
+}
+
+impl Drop for DatabaseIndex {
+    fn drop(&mut self) {
+        if self.must_die.load(Ordering::Relaxed) {
+            if let Err(err) = fs::remove_dir_all(&self.path) {
+                error!("Impossible to remove mdb when Database is dropped; {}", err);
+            }
         }
     }
+}

-    pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
-        self.view.get()
+pub struct Database {
+    indexes: Map<String, Arc<DatabaseIndex>>,
+    path: PathBuf,
 }
+
+impl Database {
+    pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
+        Ok(Database {
+            indexes: Map::new(),
+            path: path.as_ref().to_path_buf(),
+        })
+    }
+
+    pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
+        let entries = fs::read_dir(&path)?;
+
+        let indexes = Map::new();
+        for entry in entries {
+            let path = match entry {
+                Ok(p) => p.path(),
+                Err(err) => {
+                    warn!("Impossible to retrieve the path from an entry; {}", err);
+                    continue
+                }
+            };
+
+            let name = match path.file_stem().and_then(OsStr::to_str) {
+                Some(name) => name.to_owned(),
+                None => continue
+            };
+
+            let db = match DatabaseIndex::open(path.clone()) {
+                Ok(db) => db,
+                Err(err) => {
+                    warn!("Impossible to open the database; {}", err);
+                    continue
+                }
+            };
+
+            info!("Load database {}", name);
+            indexes.insert(name, Arc::new(db));
+        }
+
+        Ok(Database {
+            indexes: indexes,
+            path: path.as_ref().to_path_buf(),
+        })
+    }
+
+    pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
+        let index_path = self.path.join(name);
+
+        if index_path.exists() {
+            return Err("Index already exists".into());
+        }
+
+        let index = DatabaseIndex::create(index_path, schema)?;
+        self.indexes.insert(name.to_owned(), Arc::new(index));
+
+        Ok(())
+    }
+
+    pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
+        let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
+        index_guard.val().must_die();
+
+        Ok(())
+    }
+
+    pub fn list_indexes(&self) -> Vec<String> {
+        self.indexes.iter().map(|g| g.key().clone()).collect()
+    }
+
+    pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
```
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
let update = index_guard.val().start_update()?;
|
||||
|
||||
Ok(IndexUpdate { index: index.to_owned(), update })
|
||||
}
|
||||
|
||||
pub fn commit_update(&self, update: IndexUpdate)-> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
|
||||
|
||||
index_guard.val().commit_update(update.update)
|
||||
}
|
||||
|
||||
pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
|
||||
Ok(index_guard.val().view())
|
||||
}
|
||||
|
||||
pub fn get_config(&self, index: &str) -> Result<Config, Box<Error>> {
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
|
||||
Ok(index_guard.val().get_config())
|
||||
}
|
||||
|
||||
pub fn update_config(&self, index: &str, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
|
||||
Ok(index_guard.val().update_config(config)?)
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
self.path.as_path()
|
||||
}
|
||||
|
||||
pub fn index_path(&self, index: &str) -> Result<PathBuf, Box<Error>> {
|
||||
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
|
||||
let path = index_guard.val().path();
|
||||
Ok(path.to_path_buf())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
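The ArcCell to ArcSwap move above is the heart of the lock-free view handoff: commit_update builds a fresh DatabaseView and publishes it with store, while readers grab the current one with load and are never blocked. A minimal sketch of that pattern, using only the arc-swap calls this diff itself relies on, with a String standing in for DatabaseView:

    use std::sync::Arc;
    use arc_swap::ArcSwap;

    // Stand-in for DatabaseIndex's `view` field: writers publish a new
    // snapshot view, readers keep using whatever Arc they already loaded.
    struct ViewCell {
        view: ArcSwap<String>, // String stands in for DatabaseView<Arc<DB>>
    }

    impl ViewCell {
        fn new(initial: String) -> ViewCell {
            ViewCell { view: ArcSwap::new(Arc::new(initial)) }
        }

        fn publish(&self, fresh: String) {
            // what commit_update / update_config do after building a new view
            self.view.store(Arc::new(fresh));
        }

        fn current(&self) -> Arc<String> {
            // what DatabaseIndex::view does
            self.view.load()
        }
    }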
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use std::error::Error;

    use serde_derive::{Serialize, Deserialize};
    use hashbrown::HashSet;
    use tempfile::tempdir;

    use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
    use crate::database::update::UpdateBuilder;
    use crate::tokenizer::DefaultBuilder;

    use super::*;

    #[test]
    fn ingest_one_update_file() -> Result<(), Box<Error>> {
        let dir = tempdir()?;
    fn ingest_one_easy_update() -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
        let stop_words = HashSet::new();

        let rocksdb_path = dir.path().join("rocksdb.rdb");
        let meilidb_path = dir.path().join("meilidb.mdb");
        let meilidb_index_name = "default";

        #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
        struct SimpleDoc {
@@ -219,9 +461,9 @@ mod tests {
            builder.build()
        };

        let database = Database::create(&rocksdb_path, &schema)?;
        let database = Database::create(&meilidb_path)?;

        let update_path = dir.path().join("update.sst");
        database.create_index(meilidb_index_name, &schema)?;

        let doc0 = SimpleDoc {
            id: 0,
@@ -236,20 +478,13 @@ mod tests {
            timestamp: 7654321,
        };

        let docid0;
        let docid1;
        let update = {
            let tokenizer_builder = DefaultBuilder::new();
            let mut builder = UpdateBuilder::new(update_path, schema);
        let tokenizer_builder = DefaultBuilder::new();
        let mut builder = database.start_update(meilidb_index_name)?;

            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;

            builder.build()?
        };

        database.ingest_update_file(update)?;
        let view = database.view();
        let view = database.commit_update(builder)?;

        let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
        let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
@@ -261,11 +496,12 @@ mod tests {
    }

    #[test]
    fn ingest_two_update_files() -> Result<(), Box<Error>> {
        let dir = tempdir()?;
    fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
        let dir = tempfile::tempdir()?;
        let stop_words = HashSet::new();

        let rocksdb_path = dir.path().join("rocksdb.rdb");
        let meilidb_path = dir.path().join("meilidb.mdb");
        let meilidb_index_name = "default";

        #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
        struct SimpleDoc {
@@ -284,7 +520,9 @@ mod tests {
            builder.build()
        };

        let database = Database::create(&rocksdb_path, &schema)?;
        let database = Database::create(&meilidb_path)?;

        database.create_index(meilidb_index_name, &schema)?;

        let doc0 = SimpleDoc {
            id: 0,
@@ -311,36 +549,17 @@ mod tests {
            timestamp: 7654321,
        };

        let docid0;
        let docid1;
        let update1 = {
            let tokenizer_builder = DefaultBuilder::new();
            let update_path = dir.path().join("update-000.sst");
            let mut builder = UpdateBuilder::new(update_path, schema.clone());
        let tokenizer_builder = DefaultBuilder::new();

            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
        let mut builder = database.start_update(meilidb_index_name)?;
        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
        database.commit_update(builder)?;

            builder.build()?
        };

        let docid2;
        let docid3;
        let update2 = {
            let tokenizer_builder = DefaultBuilder::new();
            let update_path = dir.path().join("update-001.sst");
            let mut builder = UpdateBuilder::new(update_path, schema);

            docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
            docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;

            builder.build()?
        };

        database.ingest_update_file(update1)?;
        database.ingest_update_file(update2)?;

        let view = database.view();
        let mut builder = database.start_update(meilidb_index_name)?;
        let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
        let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
        let view = database.commit_update(builder)?;

        let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
        let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
@@ -362,7 +581,7 @@
mod bench {
    extern crate test;

    use super::*;
    use std::collections::HashSet;
    use std::error::Error;
    use std::iter::repeat_with;
    use self::test::Bencher;
@@ -372,12 +591,12 @@ mod bench {
    use rand::{Rng, SeedableRng};
    use serde_derive::Serialize;
    use rand::seq::SliceRandom;
    use hashbrown::HashSet;

    use crate::tokenizer::DefaultBuilder;
    use crate::database::update::UpdateBuilder;
    use crate::database::schema::*;

    use super::*;

    fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
        let mut words = String::new();

@@ -409,7 +628,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -418,9 +640,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..300 {
@@ -432,8 +653,7 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        database.ingest_update_file(update)?;
        database.commit_update(builder)?;

        drop(database);

@@ -456,7 +676,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -465,9 +688,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..3000 {
@@ -479,8 +701,7 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        database.ingest_update_file(update)?;
        database.commit_update(builder)?;

        drop(database);

@@ -504,7 +725,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -513,9 +737,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..30_000 {
@@ -527,8 +750,7 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        database.ingest_update_file(update)?;
        database.commit_update(builder)?;

        drop(database);

@@ -551,7 +773,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -560,9 +785,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..300 {
@@ -574,12 +798,11 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        let view = database.ingest_update_file(update)?;
        let view = database.commit_update(builder)?;

        bench.iter(|| {
            for q in &["a", "b", "c", "d", "e"] {
                let documents = view.query_builder().unwrap().query(q, 0..20);
                let documents = view.query_builder().query(q, 0..20);
                test::black_box(|| documents);
            }
        });
@@ -598,7 +821,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -607,9 +833,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..3000 {
@@ -621,12 +846,11 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        let view = database.ingest_update_file(update)?;
        let view = database.commit_update(builder)?;

        bench.iter(|| {
            for q in &["a", "b", "c", "d", "e"] {
                let documents = view.query_builder().unwrap().query(q, 0..20);
                let documents = view.query_builder().query(q, 0..20);
                test::black_box(|| documents);
            }
        });
@@ -646,7 +870,10 @@ mod bench {
        let schema = builder.build();

        let db_path = dir.path().join("bench.mdb");
        let database = Database::create(db_path.clone(), &schema)?;
        let index_name = "default";

        let database = Database::create(&db_path)?;
        database.create_index(index_name, &schema)?;

        #[derive(Serialize)]
        struct Document {
@@ -655,9 +882,8 @@ mod bench {
            description: String,
        }

        let path = dir.path().join("update-000.sst");
        let tokenizer_builder = DefaultBuilder;
        let mut builder = UpdateBuilder::new(path, schema);
        let mut builder = database.start_update(index_name)?;
        let mut rng = XorShiftRng::seed_from_u64(42);

        for i in 0..30_000 {
@@ -669,12 +895,11 @@ mod bench {
            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
        }

        let update = builder.build()?;
        let view = database.ingest_update_file(update)?;
        let view = database.commit_update(builder)?;

        bench.iter(|| {
            for q in &["a", "b", "c", "d", "e"] {
                let documents = view.query_builder().unwrap().query(q, 0..20);
                let documents = view.query_builder().query(q, 0..20);
                test::black_box(|| documents);
            }
        });
src/database/number.rs (new file, 98 lines)
@@ -0,0 +1,98 @@
use std::cmp::Ordering;
use std::str::FromStr;
use std::fmt;

use serde_derive::{Serialize, Deserialize};

#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone)]
pub enum Number {
    Unsigned(u64),
    Signed(i64),
    Float(f64),
}

impl FromStr for Number {
    type Err = ParseNumberError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if let Ok(unsigned) = u64::from_str(s) {
            return Ok(Number::Unsigned(unsigned))
        }

        if let Ok(signed) = i64::from_str(s) {
            return Ok(Number::Signed(signed))
        }

        if let Ok(float) = f64::from_str(s) {
            if float == 0.0 || float.is_normal() {
                return Ok(Number::Float(float))
            }
        }

        Err(ParseNumberError)
    }
}

impl PartialOrd for Number {
    fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Number {
    fn cmp(&self, other: &Number) -> Ordering {
        use Number::*;
        match (self, other) {
            (Unsigned(s), Unsigned(o)) => s.cmp(o),
            (Unsigned(s), Signed(o)) => {
                let s = i128::from(*s);
                let o = i128::from(*o);
                s.cmp(&o)
            },
            (Unsigned(s), Float(o)) => {
                let s = *s as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },

            (Signed(s), Unsigned(o)) => {
                let s = i128::from(*s);
                let o = i128::from(*o);
                s.cmp(&o)
            },
            (Signed(s), Signed(o)) => s.cmp(o),
            (Signed(s), Float(o)) => {
                let s = *s as f64;
                s.partial_cmp(o).unwrap_or(Ordering::Equal)
            },

            (Float(s), Unsigned(o)) => {
                let o = *o as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },
            (Float(s), Signed(o)) => {
                let o = *o as f64;
                s.partial_cmp(&o).unwrap_or(Ordering::Equal)
            },
            (Float(s), Float(o)) => {
                s.partial_cmp(o).unwrap_or(Ordering::Equal)
            },
        }
    }
}

impl PartialEq for Number {
    fn eq(&self, other: &Number) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for Number { }

pub struct ParseNumberError;

impl fmt::Display for ParseNumberError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("can not parse number")
    }
}
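A quick illustration of how this Number type behaves, assuming only the FromStr and Ord implementations above; cross-variant comparisons widen to i128 for integers or f64 when a float is involved:

    use std::str::FromStr;

    // Parsing tries u64, then i64, then f64 (rejecting NaN, infinities
    // and subnormals through the is_normal check above).
    let a = Number::from_str("42").ok().unwrap();   // Number::Unsigned(42)
    let b = Number::from_str("-7").ok().unwrap();   // Number::Signed(-7)
    let c = Number::from_str("3.14").ok().unwrap(); // Number::Float(3.14)

    // Mixed comparisons are total: integer pairs meet through i128,
    // anything involving a Float goes through f64.
    assert!(b < a); // -7 < 42 via i128
    assert!(c < a); // 3.14 < 42.0 via f64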
@@ -7,14 +7,14 @@ use std::sync::Arc;

use serde_derive::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
use serde::Serialize;

use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;

pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };

#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
@@ -23,6 +23,9 @@ pub struct SchemaProps {

    #[serde(default)]
    indexed: bool,

    #[serde(default)]
    ranked: bool,
}

impl SchemaProps {
@@ -33,6 +36,10 @@ impl SchemaProps {
    pub fn is_indexed(self) -> bool {
        self.indexed
    }

    pub fn is_ranked(self) -> bool {
        self.ranked
    }
}

impl BitOr for SchemaProps {
@@ -42,6 +49,7 @@ impl BitOr for SchemaProps {
        SchemaProps {
            stored: self.stored | other.stored,
            indexed: self.indexed | other.indexed,
            ranked: self.ranked | other.ranked,
        }
    }
}
@@ -113,6 +121,23 @@ impl Schema {
        Ok(())
    }

    pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
        let mut buffer = Vec::new();
        reader.read_to_end(&mut buffer)?;
        let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
        Ok(builder.build())
    }

    pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
        let identifier = self.inner.identifier.clone();
        let attributes = self.attributes_ordered();
        let builder = SchemaBuilder { identifier, attributes };
        let string = serde_json::to_string_pretty(&builder)?;
        writer.write_all(string.as_bytes())?;

        Ok(())
    }

    pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
        let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
        Ok(builder.build())
@@ -142,7 +167,7 @@ impl Schema {
    }

    pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    where T: serde::Serialize,
    {
        let id_attribute_name = &self.inner.identifier;
        let serializer = FindDocumentIdSerializer { id_attribute_name };
@@ -168,7 +193,8 @@ impl Schema {
    }
}

#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub(crate) u16);

impl SchemaAttr {
@@ -254,4 +280,40 @@ mod tests {

        Ok(())
    }

    #[test]
    fn serialize_deserialize_json() -> Result<(), Box<Error>> {
        let mut builder = SchemaBuilder::with_identifier("id");
        builder.new_attribute("alpha", STORED);
        builder.new_attribute("beta", STORED | INDEXED);
        builder.new_attribute("gamma", INDEXED);
        let schema = builder.build();

        let mut buffer = Vec::new();
        schema.to_json(&mut buffer)?;

        let schema2 = Schema::from_json(buffer.as_slice())?;
        assert_eq!(schema, schema2);

        let data = r#"
            {
                "identifier": "id",
                "attributes": {
                    "alpha": {
                        "stored": true
                    },
                    "beta": {
                        "stored": true,
                        "indexed": true
                    },
                    "gamma": {
                        "indexed": true
                    }
                }
            }"#;
        let schema2 = Schema::from_json(data.as_bytes())?;
        assert_eq!(schema, schema2);

        Ok(())
    }
}
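For context, a small sketch of how the new RANKED property combines with the existing ones when declaring a schema; the attribute names here are illustrative, not from the diff:

    // Hypothetical schema using the new RANKED property introduced above.
    // Ranked values are serialized into Number entries of the ranked map
    // rather than into the inverted index.
    let mut builder = SchemaBuilder::with_identifier("id");
    builder.new_attribute("title", STORED | INDEXED);
    builder.new_attribute("timestamp", STORED | RANKED);
    let schema = builder.build();
    // In the JSON form, "stored", "indexed" and "ranked" all default to
    // false when absent, as the serialize_deserialize_json test above
    // relies on.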
@@ -1,23 +1,24 @@
use std::collections::HashSet;

use serde::Serialize;
use serde::ser;

use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use crate::{is_cjk, DocumentId, DocIndex};

use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;

pub struct IndexerSerializer<'a, B> {
pub struct IndexerSerializer<'a, 'b, B> {
    pub tokenizer_builder: &'a B,
    pub update: &'a mut DocumentUpdate,
    pub update: &'a mut DocumentUpdate<'b>,
    pub document_id: DocumentId,
    pub attribute: SchemaAttr,
    pub stop_words: &'a HashSet<String>,
}

impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
    type Ok = ();
@@ -54,10 +55,8 @@ where B: TokenizerBuilder
        let document_id = self.document_id;

        // FIXME must u32::try_from instead
        let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
            Ok(attribute) => attribute,
            Err(_) => return Ok(()),
        };
        let attribute = self.attribute.0;
        let word_index = word_index as u32;

        // insert the exact representation
        let word_lower = word.to_lowercase();
@@ -66,24 +65,23 @@ where B: TokenizerBuilder
        if self.stop_words.contains(&word_lower) { continue }

        // and the unidecoded lowercased version
        let word_unidecoded = unidecode::unidecode(word).to_lowercase();
        if word_lower != word_unidecoded {
            let word_area = match WordArea::new(char_index as u32, length) {
                Ok(word_area) => word_area,
                Err(_) => return Ok(()),
            };
        if !word_lower.chars().any(is_cjk) {
            let word_unidecoded = unidecode::unidecode(word).to_lowercase();
            let word_unidecoded = word_unidecoded.trim();
            if word_lower != word_unidecoded {
                let char_index = char_index as u32;
                let char_length = length;

                let doc_index = DocIndex { document_id, attribute, word_area };
                self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
                let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
                self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?;
            }
        }

        let word_area = match WordArea::new(char_index as u32, length) {
            Ok(word_area) => word_area,
            Err(_) => return Ok(()),
        };
        let char_index = char_index as u32;
        let char_length = length;

        let doc_index = DocIndex { document_id, attribute, word_area };
        self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
        let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
        self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
    }
    Ok(())
}
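To make the non-CJK branch above concrete: a word is indexed under two keys whenever its unidecoded form differs from its lowercased form, so accented and plain-ASCII queries both match. A small illustration, assuming the unidecode crate's usual transliteration:

    // For a non-CJK word like "Été":
    let word = "Été";
    let word_lower = word.to_lowercase();                            // "été"
    let word_unidecoded = unidecode::unidecode(word).to_lowercase(); // "ete"
    assert_ne!(word_lower, word_unidecoded.as_str());
    // Two DocIndex entries are therefore inserted, one under "été" and
    // one under "ete", carrying the same word_index/char_index/char_length.
    // CJK words skip this branch entirely thanks to the is_cjk guard.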
@@ -17,8 +17,10 @@ macro_rules! forward_to_unserializable_type {

pub mod find_id;
pub mod key_to_string;
pub mod value_to_number;
pub mod serializer;
pub mod indexer_serializer;
pub mod deserializer;

pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut s = DefaultHasher::new();
@@ -55,3 +57,9 @@ impl fmt::Display for SerializerError {
}

impl Error for SerializerError {}

impl From<String> for SerializerError {
    fn from(value: String) -> SerializerError {
        SerializerError::Custom(value)
    }
}
@@ -1,24 +1,26 @@
use hashbrown::HashSet;
use std::collections::HashSet;

use serde::Serialize;
use serde::ser;

use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::value_to_number::ValueToNumberSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;

pub struct Serializer<'a, B> {
pub struct Serializer<'a, 'b, B> {
    pub schema: &'a Schema,
    pub update: &'a mut DocumentUpdate,
    pub update: &'a mut DocumentUpdate<'b>,
    pub document_id: DocumentId,
    pub tokenizer_builder: &'a B,
    pub stop_words: &'a HashSet<String>,
}

impl<'a, B> ser::Serializer for Serializer<'a, B>
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
where B: TokenizerBuilder
{
    type Ok = ();
@@ -27,8 +29,8 @@ where B: TokenizerBuilder
    type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeMap = MapSerializer<'a, B>;
    type SerializeStruct = StructSerializer<'a, B>;
    type SerializeMap = MapSerializer<'a, 'b, B>;
    type SerializeStruct = StructSerializer<'a, 'b, B>;
    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

    forward_to_unserializable_type! {
@@ -154,8 +156,8 @@ where B: TokenizerBuilder
    {
        Ok(StructSerializer {
            schema: self.schema,
            update: self.update,
            document_id: self.document_id,
            update: self.update,
            tokenizer_builder: self.tokenizer_builder,
            stop_words: self.stop_words,
        })
@@ -173,16 +175,16 @@ where B: TokenizerBuilder
    }
}

pub struct MapSerializer<'a, B> {
pub struct MapSerializer<'a, 'b, B> {
    pub schema: &'a Schema,
    pub document_id: DocumentId,
    pub update: &'a mut DocumentUpdate,
    pub update: &'a mut DocumentUpdate<'b>,
    pub tokenizer_builder: &'a B,
    pub stop_words: &'a HashSet<String>,
    pub current_key_name: Option<String>,
}

impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
    type Ok = ();
@@ -206,7 +208,7 @@ where B: TokenizerBuilder
    fn serialize_entry<K: ?Sized, V: ?Sized>(
        &mut self,
        key: &K,
        value: &V
        value: &V,
    ) -> Result<(), Self::Error>
    where K: Serialize, V: Serialize,
    {
@@ -216,7 +218,7 @@ where B: TokenizerBuilder
        let props = self.schema.props(attr);
        if props.is_stored() {
            let value = bincode::serialize(value).unwrap();
            self.update.insert_attribute_value(attr, value);
            self.update.insert_attribute_value(attr, &value)?;
        }
        if props.is_indexed() {
            let serializer = IndexerSerializer {
@@ -228,6 +230,10 @@ where B: TokenizerBuilder
            };
            value.serialize(serializer)?;
        }
        if props.is_ranked() {
            let number = value.serialize(ValueToNumberSerializer)?;
            self.update.register_ranked_attribute(attr, number)?;
        }
    }

    Ok(())
@@ -238,15 +244,15 @@ where B: TokenizerBuilder
    }
}

pub struct StructSerializer<'a, B> {
pub struct StructSerializer<'a, 'b, B> {
    pub schema: &'a Schema,
    pub document_id: DocumentId,
    pub update: &'a mut DocumentUpdate,
    pub update: &'a mut DocumentUpdate<'b>,
    pub tokenizer_builder: &'a B,
    pub stop_words: &'a HashSet<String>,
}

impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
    type Ok = ();
@@ -263,7 +269,7 @@ where B: TokenizerBuilder
        let props = self.schema.props(attr);
        if props.is_stored() {
            let value = bincode::serialize(value).unwrap();
            self.update.insert_attribute_value(attr, value);
            self.update.insert_attribute_value(attr, &value)?;
        }
        if props.is_indexed() {
            let serializer = IndexerSerializer {
@@ -275,6 +281,10 @@ where B: TokenizerBuilder
            };
            value.serialize(serializer)?;
        }
        if props.is_ranked() {
            let integer = value.serialize(ValueToNumberSerializer)?;
            self.update.register_ranked_attribute(attr, integer)?;
        }
    }

    Ok(())
src/database/serde/value_to_number.rs (new file, 176 lines)
@@ -0,0 +1,176 @@
use std::str::FromStr;

use serde::Serialize;
use serde::{ser, ser::Error};

use crate::database::serde::SerializerError;
use crate::database::Number;

pub struct ValueToNumberSerializer;

impl ser::Serializer for ValueToNumberSerializer {
    type Ok = Number;
    type Error = SerializerError;
    type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

    forward_to_unserializable_type! {
        bool => serialize_bool,
        char => serialize_char,
    }

    fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Signed(value as i64))
    }

    fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Signed(value as i64))
    }

    fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Signed(value as i64))
    }

    fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Signed(value as i64))
    }

    fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Unsigned(value as u64))
    }

    fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Unsigned(value as u64))
    }

    fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Unsigned(value as u64))
    }

    fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Unsigned(value as u64))
    }

    fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Float(value as f64))
    }

    fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
        Ok(Number::Float(value))
    }

    fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
        Number::from_str(value).map_err(SerializerError::custom)
    }

    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "&[u8]" })
    }

    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "()" })
    }

    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "unit struct" })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str
    ) -> Result<Self::Ok, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "unit variant" })
    }

    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T
    ) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        value.serialize(self)
    }

    fn serialize_newtype_variant<T: ?Sized>(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T
    ) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        Err(SerializerError::UnserializableType { name: "newtype variant" })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnserializableType { name: "sequence" })
    }

    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
        Err(SerializerError::UnserializableType { name: "tuple" })
    }

    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize
    ) -> Result<Self::SerializeTupleStruct, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "tuple struct" })
    }

    fn serialize_tuple_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize
    ) -> Result<Self::SerializeTupleVariant, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "tuple variant" })
    }

    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
        Err(SerializerError::UnserializableType { name: "map" })
    }

    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize
    ) -> Result<Self::SerializeStruct, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "struct" })
    }

    fn serialize_struct_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize
    ) -> Result<Self::SerializeStructVariant, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "struct variant" })
    }
}
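A short illustration of what this serializer accepts and rejects, assuming only serde's standard Serialize impls for primitives:

    use serde::Serialize;

    // Numeric primitives map straight onto Number variants;
    // strings go through Number::from_str.
    let n = 42u32.serialize(ValueToNumberSerializer).unwrap();
    assert_eq!(n, Number::Unsigned(42));

    let s = "123".serialize(ValueToNumberSerializer).unwrap();
    assert_eq!(s, Number::Unsigned(123));

    let f = 3.5f32.serialize(ValueToNumberSerializer).unwrap();
    assert_eq!(f, Number::Float(3.5));

    // Compound values cannot be ranked and fail with UnserializableType.
    assert!(vec![1, 2, 3].serialize(ValueToNumberSerializer).is_err());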
@@ -1,64 +0,0 @@
use std::path::PathBuf;
use std::error::Error;

use hashbrown::HashSet;
use serde::Serialize;

use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::Schema;

use crate::DocumentId;
use super::{Update, RawUpdateBuilder};

pub struct UpdateBuilder {
    schema: Schema,
    raw_builder: RawUpdateBuilder,
}

impl UpdateBuilder {
    pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
        UpdateBuilder {
            schema: schema,
            raw_builder: RawUpdateBuilder::new(path),
        }
    }

    pub fn update_document<T, B>(
        &mut self,
        document: T,
        tokenizer_builder: &B,
        stop_words: &HashSet<String>,
    ) -> Result<DocumentId, SerializerError>
    where T: Serialize,
          B: TokenizerBuilder,
    {
        let document_id = self.schema.document_id(&document)?;
        let update = self.raw_builder.document_update(document_id);

        let serializer = Serializer {
            schema: &self.schema,
            document_id: document_id,
            tokenizer_builder: tokenizer_builder,
            update: update,
            stop_words: stop_words,
        };

        document.serialize(serializer)?;

        Ok(document_id)
    }

    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    {
        let document_id = self.schema.document_id(&document)?;
        self.raw_builder.document_update(document_id).remove();
        Ok(document_id)
    }

    pub fn build(self) -> Result<Update, Box<Error>> {
        self.raw_builder.build()
    }
}
src/database/update/index_event.rs (new file, 55 lines)
@@ -0,0 +1,55 @@
use std::error::Error;

use byteorder::{ReadBytesExt, WriteBytesExt};

use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::Index;
use crate::data::DocIds;

pub enum WriteIndexEvent<'a> {
    RemovedDocuments(&'a DocIds),
    UpdatedDocuments(&'a Index),
}

impl<'a> WriteToBytes for WriteIndexEvent<'a> {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        match self {
            WriteIndexEvent::RemovedDocuments(doc_ids) => {
                let _ = bytes.write_u8(0);
                doc_ids.write_to_bytes(bytes);
            },
            WriteIndexEvent::UpdatedDocuments(index) => {
                let _ = bytes.write_u8(1);
                index.write_to_bytes(bytes);
            }
        }
    }
}

pub enum ReadIndexEvent {
    RemovedDocuments(DocIds),
    UpdatedDocuments(Index),
}

impl ReadIndexEvent {
    pub fn updated_documents(self) -> Option<Index> {
        use ReadIndexEvent::*;
        match self {
            RemovedDocuments(_) => None,
            UpdatedDocuments(index) => Some(index),
        }
    }
}

impl FromSharedDataCursor for ReadIndexEvent {
    type Error = Box<Error>;

    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
        match cursor.read_u8()? {
            0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments),
            1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments),
            _ => unreachable!(),
        }
    }
}
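For orientation, a sketch of how these events are meant to round-trip through the merge operator; the cursor construction and the handling inside the match arms are assumptions, since neither is shown in this hunk:

    use std::error::Error;

    // Reading one event back from merged bytes: the tag byte written by
    // WriteToBytes selects the variant (0 = removals, 1 = updates).
    fn apply_one(cursor: &mut SharedDataCursor) -> Result<(), Box<Error>> {
        match ReadIndexEvent::from_shared_data_cursor(cursor)? {
            // hypothetical handling; the real logic lives in the merge operator
            ReadIndexEvent::RemovedDocuments(_ids) => { /* subtract ids from the index */ }
            ReadIndexEvent::UpdatedDocuments(_index) => { /* union into the index */ }
        }
        Ok(())
    }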
@@ -1,17 +1,239 @@
use std::path::{Path, PathBuf};
use std::collections::{HashSet, BTreeMap};
use std::error::Error;

mod builder;
mod raw_builder;
use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use sdset::{Set, SetBuf};
use serde::Serialize;

pub use self::builder::UpdateBuilder;
pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema;
use crate::database::index::IndexBuilder;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
use crate::write_to_bytes::WriteToBytes;
use crate::data::DocIds;
use crate::{DocumentId, DocIndex};

pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};

mod index_event;
mod ranked_map_event;

pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec

pub struct Update {
    sst_file: PathBuf,
    schema: Schema,
    raw_builder: RawUpdateBuilder,
}

impl Update {
    pub fn path(&self) -> &Path {
        &self.sst_file
    pub(crate) fn new(schema: Schema) -> Update {
        Update { schema, raw_builder: RawUpdateBuilder::new() }
    }

    pub fn update_document<T, B>(
        &mut self,
        document: T,
        tokenizer_builder: &B,
        stop_words: &HashSet<String>,
    ) -> Result<DocumentId, SerializerError>
    where T: Serialize,
          B: TokenizerBuilder,
    {
        let document_id = self.schema.document_id(&document)?;

        let serializer = Serializer {
            schema: &self.schema,
            document_id: document_id,
            tokenizer_builder: tokenizer_builder,
            update: &mut self.raw_builder.document_update(document_id)?,
            stop_words: stop_words,
        };

        document.serialize(serializer)?;

        Ok(document_id)
    }

    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    {
        let document_id = self.schema.document_id(&document)?;
        self.raw_builder.document_update(document_id)?.remove()?;
        Ok(document_id)
    }

    pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
        self.raw_builder.build()
    }
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum UpdateType {
    Updated,
    Deleted,
}

use UpdateType::{Updated, Deleted};

pub struct RawUpdateBuilder {
    documents_update: HashMap<DocumentId, UpdateType>,
    documents_ranked_fields: RankedMap,
    indexed_words: BTreeMap<Token, Vec<DocIndex>>,
    batch: WriteBatch,
}

impl RawUpdateBuilder {
    pub fn new() -> RawUpdateBuilder {
        RawUpdateBuilder {
            documents_update: HashMap::new(),
            documents_ranked_fields: HashMap::new(),
            indexed_words: BTreeMap::new(),
            batch: WriteBatch::new(),
        }
    }

    pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
        use serde::ser::Error;

        match self.documents_update.get(&document_id) {
            Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
            Some(Updated) => Err(SerializerError::custom(
                "This document has already been removed and cannot be updated in the same update"
            )),
        }
    }

    pub fn build(self) -> Result<WriteBatch, Box<Error>> {
        // create the list of all the removed documents
        let removed_documents = {
            let mut document_ids = Vec::new();
            for (id, update_type) in self.documents_update {
                if update_type == Deleted {
                    document_ids.push(id);
                }
            }

            document_ids.sort_unstable();
            let setbuf = SetBuf::new_unchecked(document_ids);
            DocIds::new(&setbuf)
        };

        // create the Index of all the document updates
        let index = {
            let mut builder = IndexBuilder::new();
            for (key, mut indexes) in self.indexed_words {
                indexes.sort_unstable();
                let indexes = Set::new_unchecked(&indexes);
                builder.insert(key, indexes).unwrap();
            }
            builder.build()
        };

        // WARN: removed documents must absolutely
        // be merged *before* document updates

        // === index ===

        if !removed_documents.is_empty() {
            // remove the documents using the appropriate IndexEvent
            let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes();
            self.batch.merge(DATA_INDEX, &event_bytes)?;
        }

        // update the documents using the appropriate IndexEvent
        let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes();
        self.batch.merge(DATA_INDEX, &event_bytes)?;

        // === ranked map ===

        if !removed_documents.is_empty() {
            // update the ranked map using the appropriate RankedMapEvent
            let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes();
            self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
        }

        // update the documents using the appropriate RankedMapEvent
        let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes();
        self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;

        Ok(self.batch)
    }
}

pub struct DocumentUpdate<'a> {
    document_id: DocumentId,
    inner: &'a mut RawUpdateBuilder,
}

impl<'a> DocumentUpdate<'a> {
    pub fn remove(&mut self) -> Result<(), SerializerError> {
        use serde::ser::Error;

        if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
            return Err(SerializerError::custom(
                "This document has already been updated and cannot be removed in the same update"
            ));
        }

        let start = DocumentKey::new(self.document_id).with_attribute_min();
        let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1
        self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;

        Ok(())
    }

    pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
        use serde::ser::Error;

        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
            return Err(SerializerError::custom(
                "This document has already been deleted and cannot be updated in the same update"
            ));
        }

        let key = DocumentKeyAttr::new(self.document_id, attr);
        self.inner.batch.put(key.as_ref(), &value)?;

        Ok(())
    }

    pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
        use serde::ser::Error;

        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
            return Err(SerializerError::custom(
                "This document has already been deleted and cannot be updated in the same update"
            ));
        }

        self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);

        Ok(())
    }

    pub fn register_ranked_attribute(
        &mut self,
        attr: SchemaAttr,
        number: Number,
    ) -> Result<(), SerializerError>
    {
        use serde::ser::Error;

        if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
            return Err(SerializerError::custom(
                "This document has already been deleted, ranked attributes cannot be added in the same update"
            ));
        }

        self.inner.documents_ranked_fields.insert((self.document_id, attr), number);

        Ok(())
    }
}
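The UpdateType guards above make conflicting operations on one document fail inside a single Update; a sketch of the rules, with the document type and setup assumed:

    use std::collections::HashSet;

    // Hypothetical: `Doc` is any Serialize type carrying at least one
    // stored or indexed attribute; `update` comes from start_update.
    fn one_update(mut update: Update, doc_a: &Doc, doc_b: &Doc) {
        let tokenizer_builder = DefaultBuilder::new();
        let stop_words = HashSet::new();

        update.update_document(doc_a, &tokenizer_builder, &stop_words).unwrap(); // doc_a -> Updated
        update.remove_document(doc_b).unwrap();                                  // doc_b -> Deleted

        // A conflicting second touch of the same document is rejected:
        assert!(update.remove_document(doc_a).is_err()); // already updated
        assert!(update.update_document(doc_b, &tokenizer_builder, &stop_words).is_err()); // already deleted

        // build() then folds everything into one WriteBatch, merging the
        // removal events before the update events, as the WARN comment
        // above requires.
    }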
src/database/update/ranked_map_event.rs (new file, 58 lines)
@@ -0,0 +1,58 @@
use std::error::Error;

use byteorder::{ReadBytesExt, WriteBytesExt};

use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::RankedMap;
use crate::data::DocIds;

pub enum WriteRankedMapEvent<'a> {
    RemovedDocuments(&'a DocIds),
    UpdatedDocuments(&'a RankedMap),
}

impl<'a> WriteToBytes for WriteRankedMapEvent<'a> {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        match self {
            WriteRankedMapEvent::RemovedDocuments(doc_ids) => {
                let _ = bytes.write_u8(0);
                doc_ids.write_to_bytes(bytes);
            },
            WriteRankedMapEvent::UpdatedDocuments(ranked_map) => {
                let _ = bytes.write_u8(1);
                bincode::serialize_into(bytes, ranked_map).unwrap()
            }
        }
    }
}

pub enum ReadRankedMapEvent {
    RemovedDocuments(DocIds),
    UpdatedDocuments(RankedMap),
}

impl ReadRankedMapEvent {
    pub fn updated_documents(self) -> Option<RankedMap> {
        use ReadRankedMapEvent::*;
        match self {
            RemovedDocuments(_) => None,
            UpdatedDocuments(ranked_map) => Some(ranked_map),
        }
    }
}

impl FromSharedDataCursor for ReadRankedMapEvent {
    type Error = Box<Error>;

    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
        match cursor.read_u8()? {
            0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments),
            1 => {
                let ranked_map = bincode::deserialize_from(cursor)?;
                Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map))
            },
            _ => unreachable!(),
        }
    }
}
@@ -1,168 +0,0 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::path::PathBuf;
use std::error::Error;

use rocksdb::rocksdb_options;
use hashbrown::HashMap;
use fst::map::Map;
use sdset::Set;

use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, DocumentKeyAttr};
use crate::database::schema::SchemaAttr;
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;

type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;

pub struct RawUpdateBuilder {
    sst_file: PathBuf,
    document_updates: BTreeMap<DocumentId, DocumentUpdate>,
}

pub struct DocumentUpdate {
    cleared: bool,
    words_indexes: HashMap<Token, Vec<DocIndex>>,
    attributes: BTreeMap<SchemaAttr, Value>,
}

impl DocumentUpdate {
    pub fn new() -> DocumentUpdate {
        DocumentUpdate {
            cleared: false,
            words_indexes: HashMap::new(),
            attributes: BTreeMap::new(),
        }
    }

    pub fn remove(&mut self) {
        self.cleared = true;
        self.clear();
    }

    pub fn clear(&mut self) {
        self.words_indexes.clear();
        self.attributes.clear();
    }

    pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
        self.attributes.insert(attr, value);
    }

    pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
        self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
    }
}

impl RawUpdateBuilder {
    pub fn new(path: PathBuf) -> RawUpdateBuilder {
        RawUpdateBuilder {
            sst_file: path,
            document_updates: BTreeMap::new(),
        }
    }

    pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
        match self.document_updates.entry(document_id) {
            Entry::Occupied(mut occupied) => {
                occupied.get_mut().clear();
                occupied.into_mut()
            },
            Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
        }
    }

    pub fn build(mut self) -> Result<Update, Box<Error>> {
        let mut removed_document_ids = Vec::new();
        let mut words_indexes = BTreeMap::new();

        for (&id, update) in self.document_updates.iter_mut() {
            if update.cleared { removed_document_ids.push(id) }

            for (token, indexes) in &update.words_indexes {
                words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
            }
        }

        let negative = {
            let removed_document_ids = Set::new_unchecked(&removed_document_ids);
            let doc_ids = DocIds::new(removed_document_ids);
            Negative::new(doc_ids)
        };

        let positive = {
            let mut positive_builder = PositiveBuilder::memory();

            for (key, mut indexes) in words_indexes {
                indexes.sort_unstable();
                let indexes = Set::new_unchecked(&indexes);
                positive_builder.insert(key, indexes)?;
            }

            let (map, indexes) = positive_builder.into_inner()?;
            let map = Map::from_bytes(map)?;
            let indexes = DocIndexes::from_bytes(indexes)?;
            Positive::new(map, indexes)
        };

        let index = Index { negative, positive };

        let env_options = rocksdb_options::EnvOptions::new();
        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
        file_writer.open(&self.sst_file.to_string_lossy())?;

        // write the data-index
        let mut bytes = Vec::new();
        index.write_to_bytes(&mut bytes);
        file_writer.merge(DATA_INDEX, &bytes)?;

        // write all the documents attributes updates
        for (id, update) in self.document_updates {

            let mut last_attr: Option<SchemaAttr> = None;
            for (attr, value) in update.attributes {

                if update.cleared {
                    // if there is no last attribute, remove from the first attribute
                    let start_attr = match last_attr {
                        Some(attr) => attr.next(),
                        None => Some(SchemaAttr::min())
                    };
                    let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
                    let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));

                    // delete_range between (last_attr + 1) and (attr - 1)
                    if let (Some(start), Some(end)) = (start, end) {
                        file_writer.delete_range(start.as_ref(), end.as_ref())?;
                    }
                }

                let key = DocumentKeyAttr::new(id, attr);
                file_writer.put(key.as_ref(), &value)?;
                last_attr = Some(attr);
            }

            if update.cleared {
                // if there is no last attribute, remove from the first attribute
                let start_attr = match last_attr {
                    Some(attr) => attr.next(),
                    None => Some(SchemaAttr::min())
                };
                let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
                let end = DocumentKeyAttr::with_attribute_max(id);

                // delete_range between (last_attr + 1) and attr_max
                if let Some(start) = start {
                    file_writer.delete_range(start.as_ref(), end.as_ref())?;
                }
            }
        }

        file_writer.finish()?;

        Ok(Update { sst_file: self.sst_file })
    }
}

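A note on the delete_range dance in the removed builder: it clears the stale attributes of a wiped document by deleting the key gaps between the attributes the update actually rewrites. A hedged sketch with hypothetical attribute numbers, assuming an update that writes attributes 1 and 4 for a cleared document:

// before putting attr 1: last_attr is None, so delete [SchemaAttr::min() .. 0]
// before putting attr 4: last_attr is 1, so delete [2 .. 3]
// after the loop: last_attr is 4, so delete [5 .. attr_max]
// every old attribute the update does not rewrite is removed exactly once
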
@@ -7,12 +7,14 @@ use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;

use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::deserializer::Deserializer;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::RankedMap;
use crate::database::Config;
use crate::DocumentId;

pub struct DatabaseView<D>
@@ -20,7 +22,9 @@ where D: Deref<Target=DB>
{
    snapshot: Snapshot<D>,
    index: Index,
    ranked_map: RankedMap,
    schema: Schema,
    config: Config,
}

impl<D> DatabaseView<D>
@@ -29,7 +33,9 @@ where D: Deref<Target=DB>
    pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
        let schema = retrieve_data_schema(&snapshot)?;
        let index = retrieve_data_index(&snapshot)?;
        Ok(DatabaseView { snapshot, index, schema })
        let ranked_map = retrieve_data_ranked_map(&snapshot)?;
        let config = retrieve_config(&snapshot)?;
        Ok(DatabaseView { snapshot, index, ranked_map, schema, config })
    }

    pub fn schema(&self) -> &Schema {
@@ -40,6 +46,10 @@ where D: Deref<Target=DB>
        &self.index
    }

    pub fn ranked_map(&self) -> &RankedMap {
        &self.ranked_map
    }

    pub fn into_snapshot(self) -> Snapshot<D> {
        self.snapshot
    }
@@ -48,6 +58,10 @@ where D: Deref<Target=DB>
        &self.snapshot
    }

    pub fn config(&self) -> &Config {
        &self.config
    }

    pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
        Ok(self.snapshot.get(key)?)
    }
@@ -71,12 +85,25 @@ where D: Deref<Target=DB>
        Ok(())
    }

    pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
        QueryBuilder::new(self)
    pub fn query_builder(&self) -> QueryBuilder<FilterFunc> {
        QueryBuilder::new(self.index())
    }

    pub fn raw_field_by_document_id(
        &self,
        name: &str,
        id: DocumentId
    ) -> Result<Option<Vec<u8>>, Box<Error>>
    {
        let attr = self.schema.attribute(name).ok_or("field not found")?;
        let key = DocumentKeyAttr::new(id, attr);
        let vector = self.snapshot.get(key.as_ref())?;

        Ok(vector.map(|v| v.to_vec()))
    }

    pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
    where T: DeserializeOwned
    where T: DeserializeOwned,
    {
        let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
        Ok(T::deserialize(&mut deserializer)?)

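With this change the builder borrows the index directly and its construction can no longer fail. A minimal usage sketch, where `view` is a hypothetical, already-opened `DatabaseView`:

let builder = view.query_builder(); // plain QueryBuilder<FilterFunc>, no Result anymore
let documents = builder.query("hello world", 0..20);
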
src/lib.rs
@@ -5,21 +5,34 @@ pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod attribute;
mod word_area;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;

use serde_derive::{Serialize, Deserialize};

pub use rocksdb;

pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub use self::attribute::{Attribute, AttributeError};
pub use self::word_area::{WordArea, WordAreaError};

pub fn is_cjk(c: char) -> bool {
    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
    (c >= '\u{3040}' && c <= '\u{309f}') ||
    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
    (c >= '\u{3100}' && c <= '\u{312f}') ||
    (c >= '\u{3200}' && c <= '\u{32ff}') ||
    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
    (c >= '\u{f900}' && c <= '\u{faff}')
}

/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);

@@ -36,14 +49,16 @@ pub struct DocIndex {

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: Attribute,
    pub attribute: u16,
    pub word_index: u32,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub word_area: WordArea,
    pub char_index: u32,
    pub char_length: u16,
}

/// This structure represent a matching word with informations
@@ -68,7 +83,8 @@ pub struct Match {

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: Attribute,
    pub attribute: u16,
    pub word_index: u32,

    /// Whether the word that match is an exact match or a prefix.
    pub is_exact: bool,
@@ -78,7 +94,8 @@ pub struct Match {
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub word_area: WordArea,
    pub char_index: u32,
    pub char_length: u16,
}

impl Match {
@@ -86,9 +103,11 @@ impl Match {
        Match {
            query_index: 0,
            distance: 0,
            attribute: Attribute::new_faillible(0, 0),
            attribute: 0,
            word_index: 0,
            is_exact: false,
            word_area: WordArea::new_faillible(0, 0),
            char_index: 0,
            char_length: 0,
        }
    }

@@ -96,9 +115,11 @@ impl Match {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: Attribute::max_value(),
            attribute: u16::max_value(),
            word_index: u32::max_value(),
            is_exact: true,
            word_area: WordArea::max_value(),
            char_index: u32::max_value(),
            char_length: u16::max_value(),
        }
    }
}
@@ -110,6 +131,6 @@ mod tests {

    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 16);
        assert_eq!(mem::size_of::<DocIndex>(), 24);
    }
}

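The updated size assertion follows from plain field arithmetic. A sketch with a stand-in struct (not the crate's real definition: the `document_id` field is assumed from context since the hunk only shows the changed fields, and Rust does not formally guarantee struct layout, though current compilers produce this size):

struct DocIndexSketch {
    document_id: u64, // assumed: DocumentId is a newtype over u64, as shown above
    attribute: u16,
    word_index: u32,
    char_index: u32,
    char_length: u16,
}

// 8 + 4 + 4 + 2 + 2 = 20 bytes of fields, padded to 24 to keep u64 alignment
assert_eq!(std::mem::size_of::<DocIndexSketch>(), 24);
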
@@ -1,19 +1,13 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;

use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::Document;
use crate::rank::RawDocument;

#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

impl<D> Criterion<D> for DocumentId
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
impl Criterion for DocumentId {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        lhs.id.cmp(&rhs.id)
    }
}

@@ -1,33 +1,40 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn contains_exact(matches: &&[Match]) -> bool {
    matches.iter().any(|m| m.is_exact)
}
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
    let mut count = 0;
    let mut index = 0;

#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).filter(contains_exact).count()
    for group in query_index.linear_group() {
        let len = group.len();
        count += is_exact[index..index + len].contains(&true) as usize;
        index += len;
    }

    count
}

#[derive(Debug, Clone, Copy)]
pub struct Exact;

impl<D> Criterion<D> for Exact
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = number_exact_matches(&lhs.matches);
        let rhs = number_exact_matches(&rhs.matches);
impl Criterion for Exact {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let is_exact = lhs.is_exact();
            number_exact_matches(query_index, is_exact)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let is_exact = rhs.is_exact();
            number_exact_matches(query_index, is_exact)
        };

        lhs.cmp(&rhs).reverse()
    }

@@ -4,16 +4,11 @@ mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by;
mod sort_by_attr;
mod document_id;

use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;

use crate::database::DatabaseView;
use crate::rank::Document;
use crate::rank::RawDocument;

pub use self::{
    sum_of_typos::SumOfTypos,
@@ -22,60 +17,51 @@ pub use self::{
    sum_of_words_attribute::SumOfWordsAttribute,
    sum_of_words_position::SumOfWordsPosition,
    exact::Exact,
    sort_by::SortBy,
    sort_by_attr::SortByAttr,
    document_id::DocumentId,
};

pub trait Criterion<D>
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
pub trait Criterion: Send + Sync {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;

    #[inline]
    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
        self.evaluate(lhs, rhs, view) == Ordering::Equal
    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        self.evaluate(lhs, rhs) == Ordering::Equal
    }
}

impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
        (**self).evaluate(lhs, rhs, view)
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        (**self).evaluate(lhs, rhs)
    }

    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
        (**self).eq(lhs, rhs, view)
    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        (**self).eq(lhs, rhs)
    }
}

impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T>
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
        (**self).evaluate(lhs, rhs, view)
impl<T: Criterion + ?Sized> Criterion for Box<T> {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        (**self).evaluate(lhs, rhs)
    }

    fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
        (**self).eq(lhs, rhs, view)
    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        (**self).eq(lhs, rhs)
    }
}

#[derive(Default)]
pub struct CriteriaBuilder<D>
where D: Deref<Target=DB>
{
    inner: Vec<Box<dyn Criterion<D>>>
pub struct CriteriaBuilder<'a> {
    inner: Vec<Box<dyn Criterion + 'a>>
}

impl<D> CriteriaBuilder<D>
where D: Deref<Target=DB>
impl<'a> CriteriaBuilder<'a>
{
    pub fn new() -> CriteriaBuilder<D> {
    pub fn new() -> CriteriaBuilder<'a> {
        CriteriaBuilder { inner: Vec::new() }
    }

    pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> {
    pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
        CriteriaBuilder { inner: Vec::with_capacity(capacity) }
    }

@@ -83,33 +69,29 @@ where D: Deref<Target=DB>
        self.inner.reserve(additional)
    }

    pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D>
    where C: 'static + Criterion<D>,
    pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
    where C: Criterion,
    {
        self.push(criterion);
        self
    }

    pub fn push<C>(&mut self, criterion: C)
    where C: 'static + Criterion<D>,
    pub fn push<C: 'a>(&mut self, criterion: C)
    where C: Criterion,
    {
        self.inner.push(Box::new(criterion));
    }

    pub fn build(self) -> Criteria<D> {
    pub fn build(self) -> Criteria<'a> {
        Criteria { inner: self.inner }
    }
}

pub struct Criteria<D>
where D: Deref<Target=DB>
{
    inner: Vec<Box<dyn Criterion<D>>>,
pub struct Criteria<'a> {
    inner: Vec<Box<dyn Criterion + 'a>>,
}

impl<D> Default for Criteria<D>
where D: Deref<Target=DB>
{
impl<'a> Default for Criteria<'a> {
    fn default() -> Self {
        CriteriaBuilder::with_capacity(7)
            .add(SumOfTypos)
@@ -123,10 +105,8 @@ where D: Deref<Target=DB>
    }
}

impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D>
where D: Deref<Target=DB>
{
    fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> {
    fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
        &self.inner
    }
}

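A minimal usage sketch of the new lifetime-bound builder, assuming a `ranked_map` and `schema` already in scope (hypothetical variables): criteria may now borrow local data instead of being 'static.

let custom = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;

let criteria = CriteriaBuilder::with_capacity(2)
    .add(custom) // borrows ranked_map, which the `dyn Criterion + 'a` bound permits
    .add(DocumentId)
    .build();
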
@@ -1,28 +1,28 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn number_of_query_words(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).count()
fn number_of_query_words(query_index: &[u32]) -> usize {
    query_index.linear_group().count()
}

#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;

impl<D> Criterion<D> for NumberOfWords
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = number_of_query_words(&lhs.matches);
        let rhs = number_of_query_words(&rhs.matches);
impl Criterion for NumberOfWords {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            number_of_query_words(query_index)
        };
        let rhs = {
            let query_index = rhs.query_index();
            number_of_query_words(query_index)
        };

        lhs.cmp(&rhs).reverse()
    }

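A small sketch of how `linear_group` counts distinct query words, using the `slice_group_by` crate imported above:

use slice_group_by::GroupBy;

// runs of equal query indexes: [0, 0], [1], [2, 2], so three query words matched
let query_index = &[0u32, 0, 1, 2, 2];
assert_eq!(query_index.linear_group().count(), 3);
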
@@ -1,82 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use std::marker;

use rocksdb::DB;
use serde::de::DeserializeOwned;

use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::Document;

/// An helper struct that permit to sort documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] of `Option` implementation.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```no-test
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// #[derive(Deserialize, PartialOrd, Ord, PartialEq, Eq)]
/// struct TimeOnly {
///     time: String,
/// }
///
/// let builder = CriteriaBuilder::with_capacity(8)
///        .add(SumOfTypos)
///        .add(NumberOfWords)
///        .add(WordsProximity)
///        .add(SumOfWordsAttribute)
///        .add(SumOfWordsPosition)
///        .add(Exact)
///        .add(SortBy::<TimeOnly>::new())
///        .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortBy<T> {
    _phantom: marker::PhantomData<T>,
}

impl<T> SortBy<T> {
    pub fn new() -> Self {
        SortBy::default()
    }
}

impl<T> Default for SortBy<T> {
    fn default() -> SortBy<T> {
        SortBy { _phantom: marker::PhantomData }
    }
}

impl<T, D> Criterion<D> for SortBy<T>
where D: Deref<Target=DB>,
      T: DeserializeOwned + Ord,
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
        let lhs = match view.document_by_id::<T>(lhs.id) {
            Ok(doc) => Some(doc),
            Err(e) => { eprintln!("{}", e); None },
        };

        let rhs = match view.document_by_id::<T>(rhs.id) {
            Ok(doc) => Some(doc),
            Err(e) => { eprintln!("{}", e); None },
        };

        lhs.cmp(&rhs)
    }
}

src/rank/criterion/sort_by_attr.rs (new file)
@@ -0,0 +1,122 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;

use crate::database::schema::{Schema, SchemaAttr};
use crate::rank::criterion::Criterion;
use crate::database::RankedMap;
use crate::rank::RawDocument;

/// An helper struct that permit to sort documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] of `Option` implementation.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
///        .add(SumOfTypos)
///        .add(NumberOfWords)
///        .add(WordsProximity)
///        .add(SumOfWordsAttribute)
///        .add(SumOfWordsPosition)
///        .add(Exact)
///        .add(custom_ranking)
///        .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
    ranked_map: &'a RankedMap,
    attr: SchemaAttr,
    reversed: bool,
}

impl<'a> SortByAttr<'a> {
    pub fn lower_is_better(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        SortByAttr::new(ranked_map, schema, attr_name, false)
    }

    pub fn higher_is_better(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        SortByAttr::new(ranked_map, schema, attr_name, true)
    }

    fn new(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
        reversed: bool,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        let attr = match schema.attribute(attr_name) {
            Some(attr) => attr,
            None => return Err(SortByAttrError::AttributeNotFound),
        };

        if !schema.props(attr).is_ranked() {
            return Err(SortByAttrError::AttributeNotRegisteredForRanking);
        }

        Ok(SortByAttr { ranked_map, attr, reversed })
    }
}

impl<'a> Criterion for SortByAttr<'a> {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = self.ranked_map.get(&(lhs.id, self.attr));
        let rhs = self.ranked_map.get(&(rhs.id, self.attr));

        match (lhs, rhs) {
            (Some(lhs), Some(rhs)) => {
                let order = lhs.cmp(&rhs);
                if self.reversed { order.reverse() } else { order }
            },
            (None, Some(_)) => Ordering::Greater,
            (Some(_), None) => Ordering::Less,
            (None, None) => Ordering::Equal,
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
    AttributeNotFound,
    AttributeNotRegisteredForRanking,
}

impl fmt::Display for SortByAttrError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use SortByAttrError::*;
        match self {
            AttributeNotFound => f.write_str("attribute not found in the schema"),
            AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
        }
    }
}

impl Error for SortByAttrError { }

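One design choice worth noting: the `(None, Some(_)) => Greater` arm means a document missing the ranked value always sorts after a document that has one, whether or not `reversed` is set. A standalone sketch of that ordering:

use std::cmp::Ordering;

fn rank(lhs: Option<u64>, rhs: Option<u64>) -> Ordering {
    // mirrors the match in SortByAttr::evaluate above
    match (lhs, rhs) {
        (Some(lhs), Some(rhs)) => lhs.cmp(&rhs),
        (None, Some(_)) => Ordering::Greater, // missing value ranks last
        (Some(_), None) => Ordering::Less,
        (None, None) => Ordering::Equal,
    }
}

assert_eq!(rank(None, Some(42)), Ordering::Greater);
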
@@ -1,106 +1,79 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use slice_group_by::GroupBy;

use group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    match n {
        0 => 0.0,     // log(1)
        1 => 0.30102, // log(2)
        2 => 0.47712, // log(3)
        3 => 0.60205, // log(4)
        _ => panic!("invalid number"),
    }
}

#[inline]
fn sum_matches_typos(matches: &[Match]) -> isize {
    let mut sum_typos = 0;
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
    let mut number_words = 0;
    let mut sum_typos = 0.0;
    let mut index = 0;

    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
    for group in GroupBy::new(matches, match_query_index) {
        sum_typos += unsafe { group.get_unchecked(0).distance as isize };
    for group in query_index.linear_group() {
        sum_typos += custom_log10(distance[index]);
        number_words += 1;
        index += group.len();
    }

    sum_typos - number_words
    (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;

impl<D> Criterion<D> for SumOfTypos
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = sum_matches_typos(&lhs.matches);
        let rhs = sum_matches_typos(&rhs.matches);
impl Criterion for SumOfTypos {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            sum_matches_typos(query_index, distance)
        };

        lhs.cmp(&rhs)
        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            sum_matches_typos(query_index, distance)
        };

        lhs.cmp(&rhs).reverse()
    }
}


#[cfg(test)]
mod tests {
    use super::*;

    use crate::{DocumentId, Attribute, WordArea};

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let doc0 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
                Match {
                    query_index: 1,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 2),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(0),
                matches: matches,
            }
        };
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let doc1 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 1,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
                Match {
                    query_index: 1,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 2),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(1),
                matches: matches,
            }
        };
        let query_index1 = &[0, 1];
        let distance1 = &[1, 0];

        let lhs = sum_matches_typos(&doc0.matches);
        let rhs = sum_matches_typos(&doc1.matches);
        assert_eq!(lhs.cmp(&rhs), Ordering::Less);
        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchette"
@@ -109,48 +82,15 @@ mod tests {
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let doc0 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
                Match {
                    query_index: 1,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 1),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(0),
                matches: matches,
            }
        };
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let doc1 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(1),
                matches: matches,
            }
        };
        let query_index1 = &[0];
        let distance1 = &[0];

        let lhs = sum_matches_typos(&doc0.matches);
        let rhs = sum_matches_typos(&doc1.matches);
        assert_eq!(lhs.cmp(&rhs), Ordering::Less);
        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchztte"
@@ -159,47 +99,14 @@ mod tests {
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let doc0 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
                Match {
                    query_index: 1,
                    distance: 1,
                    attribute: Attribute::new_faillible(0, 1),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(0),
                matches: matches,
            }
        };
        let query_index0 = &[0, 1];
        let distance0 = &[0, 1];

        let doc1 = {
            let matches = vec![
                Match {
                    query_index: 0,
                    distance: 0,
                    attribute: Attribute::new_faillible(0, 0),
                    is_exact: false,
                    word_area: WordArea::new_faillible(0, 6)
                },
            ];
            Document {
                id: DocumentId(1),
                matches: matches,
            }
        };
        let query_index1 = &[0];
        let distance1 = &[0];

        let lhs = sum_matches_typos(&doc0.matches);
        let rhs = sum_matches_typos(&doc1.matches);
        assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }
}

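The reworked score rewards the number of matched words and damps typos logarithmically. A worked sketch of the numbers behind the `one_typo_reference` test above:

fn score(number_words: u32, sum_typos_log: f32) -> usize {
    // same formula as sum_matches_typos above
    (number_words as f32 / (sum_typos_log + 1.0) * 1000.0) as usize
}

assert_eq!(score(2, 0.0), 2000);     // doc0: "Geox CEO", no typo
assert_eq!(score(2, 0.30102), 1537); // doc1: one typo costs a log10(2) penalty
// 2000.cmp(&1537).reverse() == Ordering::Less, so doc0 still ranks first
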
@@ -1,32 +1,39 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attributes(matches: &[Match]) -> usize {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
    GroupBy::new(matches, match_query_index).map(|group| {
        unsafe { group.get_unchecked(0).attribute.attribute() as usize }
    }).sum()
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
    let mut sum_attributes = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_attributes += attribute[index] as usize;
        index += group.len();
    }

    sum_attributes
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;

impl<D> Criterion<D> for SumOfWordsAttribute
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = sum_matches_attributes(&lhs.matches);
        let rhs = sum_matches_attributes(&rhs.matches);
impl Criterion for SumOfWordsAttribute {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let attribute = lhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let attribute = rhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        lhs.cmp(&rhs)
    }

@@ -1,32 +1,39 @@
use std::cmp::Ordering;
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
use crate::rank::RawDocument;

#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
    // note that GroupBy will never return an empty group
    // so we can do this assumption safely
    GroupBy::new(matches, match_query_index).map(|group| {
        unsafe { group.get_unchecked(0).attribute.word_index() as usize }
    }).sum()
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
    let mut sum_word_index = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_word_index += word_index[index] as usize;
        index += group.len();
    }

    sum_word_index
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;

impl<D> Criterion<D> for SumOfWordsPosition
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = sum_matches_attribute_index(&lhs.matches);
        let rhs = sum_matches_attribute_index(&rhs.matches);
impl Criterion for SumOfWordsPosition {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let word_index = lhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let word_index = rhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        lhs.cmp(&rhs)
    }

@@ -1,16 +1,17 @@
use std::cmp::{self, Ordering};
use std::ops::Deref;

use rocksdb::DB;
use group_by::GroupBy;
use slice_group_by::GroupBy;

use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
use crate::rank::RawDocument;

const MAX_DISTANCE: u32 = 8;

#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
}

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
@@ -19,30 +20,58 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    }
}

fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
    if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
    index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
    if lattr != rattr { return MAX_DISTANCE }
    index_proximity(lwi, rwi)
}

fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
    let mut min_prox = u32::max_value();
    for a in lhs {
        for b in rhs {

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
            let a = clone_tuple(a);
            let b = clone_tuple(b);
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }

    min_prox
}

fn matches_proximity(matches: &[Match]) -> u32 {
fn matches_proximity(
    query_index: &[u32],
    distance: &[u8],
    attribute: &[u16],
    word_index: &[u32],
) -> u32
{
    let mut query_index_groups = query_index.linear_group();
    let mut proximity = 0;
    let mut iter = GroupBy::new(matches, match_query_index);
    let mut index = 0;

    // iterate over groups by windows of size 2
    let mut last = iter.next();
    while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
        proximity += min_proximity(lhs, rhs);
        last = Some(rhs);
    let get_attr_wi = |index: usize, group_len: usize| {
        // retrieve the first distance group (with the lowest values)
        let len = distance[index..index + group_len].linear_group().next().unwrap().len();

        let rattr = &attribute[index..index + len];
        let rwi = &word_index[index..index + len];

        (rattr, rwi)
    };

    let mut last = query_index_groups.next().map(|group| {
        let attr_wi = get_attr_wi(index, group.len());
        index += group.len();
        attr_wi
    });

    // iter by windows of size 2
    while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
        let attr_wi = get_attr_wi(index, rhs.len());
        proximity += min_proximity(lhs, attr_wi);
        last = Some(attr_wi);
        index += rhs.len();
    }

    proximity
@@ -51,24 +80,32 @@ fn matches_proximity(matches: &[Match]) {
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;

impl<D> Criterion<D> for WordsProximity
where D: Deref<Target=DB>
{
    fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
        let lhs = matches_proximity(&lhs.matches);
        let rhs = matches_proximity(&rhs.matches);
impl Criterion for WordsProximity {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            let attribute = lhs.attribute();
            let word_index = lhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            let attribute = rhs.attribute();
            let word_index = rhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        lhs.cmp(&rhs)
    }
}


#[cfg(test)]
mod tests {
    use super::*;

    use crate::Attribute;

    #[test]
    fn three_different_attributes() {

@@ -80,18 +117,15 @@ mod tests {
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let matches = &[
            Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
            Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
            Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
            Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
            Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
        ];
        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        // soup -> of = 8
        // + of -> the = 1
        // + the -> day = 8 (not 1)
        assert_eq!(matches_proximity(matches), 17);
        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
    }

    #[test]
@@ -106,57 +140,14 @@ mod tests {
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let matches = &[
            Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
            Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
            Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
            Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
            Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
            Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
        ];
        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        // soup -> of = 1
        // + of -> the = 1
        // + the -> day = 1
        assert_eq!(matches_proximity(matches), 3);
    }
}

#[cfg(all(feature = "nightly", test))]
mod bench {
    extern crate test;

    use super::*;
    use std::error::Error;
    use self::test::Bencher;

    use rand_xorshift::XorShiftRng;
    use rand::{Rng, SeedableRng};

    use crate::Attribute;

    #[bench]
    fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
        let number_matches = 30_000;
        let mut matches = Vec::with_capacity(number_matches);
        let mut rng = XorShiftRng::seed_from_u64(42);

        for _ in 0..number_matches {
            let query_index = rng.gen_range(0, 4);

            let attribute = rng.gen_range(0, 5);
            let word_index = rng.gen_range(0, 15);
            let attribute = Attribute::new_faillible(attribute, word_index);

            let match_ = Match { query_index, attribute, ..Match::zero() };
            matches.push(match_);
        }

        bench.iter(|| {
            let proximity = matches_proximity(&matches);
            test::black_box(move || proximity)
        });

        Ok(())
        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
    }
}

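A sketch of the proximity arithmetic behind the tests above. Only the branch visible in the hunk is reproduced (words in left-to-right order within one attribute); the reversed-order branch of `index_proximity` is elided between hunks, so it is not reconstructed here:

const MAX_DISTANCE: u32 = 8;

fn proximity_sketch((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
    if lattr != rattr { return MAX_DISTANCE }
    (rwi - lwi).min(MAX_DISTANCE) // assumes lwi <= rwi
}

assert_eq!(proximity_sketch((0, 0), (1, 0)), 8); // "soup" -> "of": different attributes
assert_eq!(proximity_sketch((1, 0), (1, 1)), 1); // "of" -> "the": adjacent words
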
src/rank/mod.rs
@@ -2,32 +2,182 @@ pub mod criterion;
mod query_builder;
mod distinct_map;

use std::sync::Arc;

use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;

use crate::{Match, DocumentId};

pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};

#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {
    a.query_index == b.query_index
}

#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
    pub id: DocumentId,
    pub matches: Vec<Match>,
}

impl Document {
    pub fn new(doc: DocumentId, match_: Match) -> Self {
        unsafe { Self::from_sorted_matches(doc, vec![match_]) }
    }
    fn from_raw(raw: &RawDocument) -> Document {
        let len = raw.matches.range.len();
        let mut matches = Vec::with_capacity(len);

    pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self {
        matches.sort_unstable();
        unsafe { Self::from_sorted_matches(doc, matches) }
    }
        let query_index = raw.query_index();
        let distance = raw.distance();
        let attribute = raw.attribute();
        let word_index = raw.word_index();
        let is_exact = raw.is_exact();
        let char_index = raw.char_index();
        let char_length = raw.char_length();

    pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
        Self { id, matches }
        for i in 0..len {
            let match_ = Match {
                query_index: query_index[i],
                distance: distance[i],
                attribute: attribute[i],
                word_index: word_index[i],
                is_exact: is_exact[i],
                char_index: char_index[i],
                char_length: char_length[i],
            };
            matches.push(match_);
        }

        Document { id: raw.id, matches }
    }
}

#[derive(Clone)]
pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
}

impl RawDocument {
    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
        RawDocument { id, matches: SharedMatches { range, matches } }
    }

    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
    }

    pub fn distance(&self) -> &[u8] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
    }

    pub fn attribute(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
    }

    pub fn word_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
    }

    pub fn is_exact(&self) -> &[bool] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }

    pub fn char_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
    }

    pub fn char_length(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
    }
}

pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
    let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    matches.par_sort_unstable();

    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
        let id = group[0].0;
        let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
        let end = start + group.len();
        docs_ranges.push((id, Range { start, end }));

        matches2.extend_from_slice(group);
    }

    let matches = Arc::new(matches2);
    docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
}

#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

impl Range {
    fn len(self) -> usize {
        self.end - self.start
    }
}

#[derive(Clone)]
pub struct SharedMatches {
    range: Range,
    matches: Arc<Matches>,
}

#[derive(Clone)]
struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    attribute: Vec<u16>,
    word_index: Vec<u32>,
    is_exact: Vec<bool>,
    char_index: Vec<u32>,
    char_length: Vec<u16>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
            char_index: Vec::with_capacity(cap),
            char_length: Vec::with_capacity(cap),
        }
    }

    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
        for (_, match_) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
            self.char_index.push(match_.char_index);
            self.char_length.push(match_.char_length);
        }
    }
}

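`raw_documents_from_matches` converts an array of (id, match) pairs into the structure-of-arrays layout the criteria slice into. A minimal sketch of the per-document range computation, with hypothetical `(id, payload)` pairs standing in for real matches:

use slice_group_by::GroupBy;

let matches = [(1u64, "a"), (1, "b"), (2, "c")]; // already sorted by document id
let mut ranges = Vec::new();

for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
    let start = ranges.last().map(|&(_, _, end)| end).unwrap_or(0);
    ranges.push((group[0].0, start, start + group.len()));
}

// document 1 owns columns 0..2, document 2 owns columns 2..3
assert_eq!(ranges, vec![(1, 0, 2), (2, 2, 3)]);
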
@@ -1,30 +1,56 @@
use std::{cmp, mem, vec, str, char};
use std::ops::{Deref, Range};
use std::error::Error;
use std::{cmp, mem};
use std::ops::Range;
use std::time::Instant;
use std::hash::Hash;
use std::rc::Rc;

use group_by::BinaryGroupByMut;
use rayon::slice::ParallelSliceMut;
use slice_group_by::{GroupByMut, LinearStrGroupBy};
use hashbrown::HashMap;
use fst::Streamer;
use rocksdb::DB;
use log::info;

use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::DatabaseView;
use crate::{Match, DocumentId};
use crate::rank::Document;
use crate::database::Index;
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};

#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
    Space,
    Cjk,
    Other,
}

fn classify_char(c: char) -> CharCategory {
    if c.is_whitespace() { CharCategory::Space }
    else if is_cjk(c) { CharCategory::Cjk }
    else { CharCategory::Other }
}

fn is_word(s: &&str) -> bool {
    !s.chars().any(char::is_whitespace)
}

fn same_group_category(a: char, b: char) -> bool {
    let ca = classify_char(a);
    let cb = classify_char(b);
    if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
}

fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
    let mut automatons = Vec::new();
    let mut words = query.split_whitespace().map(str::to_lowercase).peekable();
    let mut groups = LinearStrGroupBy::new(query, same_group_category)
        .filter(is_word)
        .map(str::to_lowercase)
        .peekable();

    while let Some(word) = words.next() {
        let has_following_word = words.peek().is_some();
        let lev = if has_following_word || has_end_whitespace {
    let mut automatons = Vec::new();
    while let Some(word) = groups.next() {
        let has_following_word = groups.peek().is_some();
        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
            automaton::build_dfa(&word)
        } else {
            automaton::build_prefix_dfa(&word)
@@ -35,43 +61,38 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
    automatons
}

pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
pub type FilterFunc = fn(DocumentId) -> bool;

pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>
{
    view: &'a DatabaseView<D>,
    criteria: Criteria<D>,
pub struct QueryBuilder<'i, 'c, FI> {
    index: &'i Index,
    criteria: Criteria<'c>,
    filter: Option<FI>,
}

impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
where D: Deref<Target=DB>
{
    pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
        QueryBuilder::with_criteria(view, Criteria::default())
impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
    pub fn new(index: &'i Index) -> Self {
        QueryBuilder::with_criteria(index, Criteria::default())
    }

    pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
        QueryBuilder { index, criteria, filter: None }
    }
}

impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
{
    pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
        Ok(QueryBuilder { view, criteria, filter: None })
    }

    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
    where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
    where F: Fn(DocumentId) -> bool,
    {
        QueryBuilder {
            view: self.view,
            index: self.index,
            criteria: self.criteria,
            filter: Some(function)
        }
    }

    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
    where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
    where F: Fn(DocumentId) -> Option<K>,
          K: Hash + Eq,
    {
        DistinctQueryBuilder {
@@ -81,19 +102,19 @@ where D: Deref<Target=DB>,
        }
    }

    fn query_all(&self, query: &str) -> Vec<Document> {
    fn query_all(&self, query: &str) -> Vec<RawDocument> {
        let automatons = split_whitespace_automatons(query);

        let mut stream = {
            let mut op_builder = fst::map::OpBuilder::new();
            for automaton in &automatons {
                let stream = self.view.index().positive.map().search(automaton);
                let stream = self.index.map.search(automaton);
                op_builder.push(stream);
            }
            op_builder.union()
        };

        let mut matches = HashMap::new();
        let mut matches = Vec::new();

        while let Some((input, indexed_values)) = stream.next() {
            for iv in indexed_values {
@@ -101,7 +122,7 @@ where D: Deref<Target=DB>,
                let distance = automaton.eval(input).to_u8();
                let is_exact = distance == 0 && input.len() == automaton.query_len();

                let doc_indexes = &self.view.index().positive.indexes();
                let doc_indexes = &self.index.indexes;
                let doc_indexes = &doc_indexes[iv.value as usize];

                for doc_index in doc_indexes {
@@ -109,41 +130,50 @@ where D: Deref<Target=DB>,
                        query_index: iv.index as u32,
                        distance: distance,
                        attribute: doc_index.attribute,
                        word_index: doc_index.word_index,
                        is_exact: is_exact,
                        word_area: doc_index.word_area,
                        char_index: doc_index.char_index,
                        char_length: doc_index.char_length,
                    };
                    matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
                    matches.push((doc_index.document_id, match_));
                }
            }
        }

        info!("{} documents to classify", matches.len());
        let total_matches = matches.len();
        let raw_documents = raw_documents_from_matches(matches);

        matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
        info!("{} total documents to classify", raw_documents.len());
        info!("{} total matches to classify", total_matches);

        raw_documents
    }
}

impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
      FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
where FI: Fn(DocumentId) -> bool,
{
    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
        // We give the filtering work to the query distinct builder,
        // We delegate the filter work to the distinct query builder,
        // specifying a distinct rule that has no effect.
        if self.filter.is_some() {
            let builder = self.with_distinct(|_, _| None as Option<()>, 1);
            let builder = self.with_distinct(|_| None as Option<()>, 1);
            return builder.query(query, range);
        }

        let start = Instant::now();
        let mut documents = self.query_all(query);
        let mut groups = vec![documents.as_mut_slice()];
        let view = &self.view;
        info!("query_all took {:.2?}", start.elapsed());

        'criteria: for criterion in self.criteria.as_ref() {
        let mut groups = vec![documents.as_mut_slice()];

        'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut documents_seen = 0;

            for group in tmp_groups {
                info!("criterion {}, documents group of size {}", ci, group.len());

                // if this group does not overlap with the requested range,
                // push it without sorting and splitting it
                if documents_seen + group.len() < range.start {
@@ -152,9 +182,11 @@ where D: Deref<Target=DB>,
                    continue;
                }

                group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
                let start = Instant::now();
                group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
                info!("criterion {} sort took {:.2?}", ci, start.elapsed());

                for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    documents_seen += group.len();
                    groups.push(group);

@@ -165,28 +197,22 @@ where D: Deref<Target=DB>,
            }
        }

        // `drain` removes the documents efficiently using `ptr::copy`
        // TODO it could be more efficient to have a custom iterator
        let offset = cmp::min(documents.len(), range.start);
        documents.drain(0..offset);
        documents.truncate(range.len());
        documents
        let iter = documents.into_iter().skip(offset).take(range.len());
        iter.map(|d| Document::from_raw(&d)).collect()
    }

}
|
||||
|
||||
pub struct DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
inner: QueryBuilder<'a, D, FI>,
|
||||
pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
|
||||
inner: QueryBuilder<'i, 'c, FI>,
|
||||
function: FD,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>,
|
||||
impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
|
||||
{
|
||||
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
|
||||
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
|
||||
where F: Fn(DocumentId) -> bool,
|
||||
{
|
||||
DistinctQueryBuilder {
|
||||
inner: self.inner.with_filter(function),
|
||||
@ -196,17 +222,18 @@ where D: Deref<Target=DB>,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
|
||||
where D: Deref<Target=DB>,
|
||||
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
|
||||
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
|
||||
impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
|
||||
where FI: Fn(DocumentId) -> bool,
|
||||
FD: Fn(DocumentId) -> Option<K>,
|
||||
K: Hash + Eq,
|
||||
{
|
||||
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
|
||||
let start = Instant::now();
|
||||
let mut documents = self.inner.query_all(query);
|
||||
info!("query_all took {:.2?}", start.elapsed());
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
let mut key_cache = HashMap::new();
|
||||
let view = &self.inner.view;
|
||||
|
||||
let mut filter_map = HashMap::new();
|
||||
// these two variables inform on the current distinct map and
@ -215,12 +242,14 @@ where D: Deref<Target=DB>,
let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0;

'criteria: for criterion in self.inner.criteria.as_ref() {
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;

for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());

// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
@ -229,22 +258,24 @@ where D: Deref<Target=DB>,
continue;
}

group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
let start = Instant::now();
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
info!("criterion {} sort took {:.2?}", ci, start.elapsed());

for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
*entry.or_insert_with(|| (filter)(document.id))
},
None => true,
};

if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));

match key.clone() {
Some(key) => buf_distinct.register(key),
@ -290,7 +321,7 @@ where D: Deref<Target=DB>,
};

if distinct_accepted && seen.len() > range.start {
out_documents.push(document);
out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break }
}
}
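
For context, the per-word automatons consumed by `query_all` can be produced along the following lines with the levenshtein_automata crate. This is a sketch, not the code from this changeset: the typo thresholds and the prefix treatment of the last word are inferred from this diff and from typos-ranking-rules.md below, and building a fresh `LevenshteinAutomatonBuilder` per word is done here only for brevity.

```rust
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};

// Build one automaton per query word: plain Levenshtein DFAs for the
// inner words, and a prefix DFA for the last word, which the end user
// may still be typing.
fn build_automatons(query: &str) -> Vec<DFA> {
    let words: Vec<&str> = query.split_whitespace().collect();
    let last = words.len().saturating_sub(1);

    words.iter().enumerate().map(|(i, &word)| {
        // per-word typo budget, as described in typos-ranking-rules.md
        let typos = match word.chars().count() {
            0..=4 => 0,
            5..=8 => 1,
            _ => 2,
        };
        let builder = LevenshteinAutomatonBuilder::new(typos, false);
        if i == last {
            builder.build_prefix_dfa(word)
        } else {
            builder.build_dfa(word)
        }
    }).collect()
}
```
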
56 src/shared_data_cursor.rs Normal file
@ -0,0 +1,56 @@
use std::io::{self, Read, Cursor, BufRead};
use std::sync::Arc;
use crate::data::SharedData;

pub struct SharedDataCursor(Cursor<SharedData>);

impl SharedDataCursor {
pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
let len = bytes.len();
let bytes = Arc::new(bytes);

SharedDataCursor::from_shared_bytes(bytes, 0, len)
}

pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
let data = SharedData::new(bytes, offset, len);
let cursor = Cursor::new(data);

SharedDataCursor(cursor)
}

pub fn extract(&mut self, amt: usize) -> SharedData {
let offset = self.0.position() as usize;
let extracted = self.0.get_ref().range(offset, amt);
self.0.consume(amt);

extracted
}
}

impl Read for SharedDataCursor {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.0.read(buf)
}
}

impl BufRead for SharedDataCursor {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
self.0.fill_buf()
}

fn consume(&mut self, amt: usize) {
self.0.consume(amt)
}
}

pub trait FromSharedDataCursor: Sized {
type Error;

fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;

fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
let mut cursor = SharedDataCursor::from_bytes(bytes);
Self::from_shared_data_cursor(&mut cursor)
}
}
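
A usage sketch of this new trait: the `Payload` type and its on-disk layout are hypothetical, chosen only to show how the `Read` impl and `extract` combine, and the byteorder crate is assumed for the integer reads.

```rust
use byteorder::{LittleEndian, ReadBytesExt};

// Hypothetical layout: a u64 id followed by a length-prefixed body.
struct Payload {
    id: u64,
    body: SharedData,
}

impl FromSharedDataCursor for Payload {
    type Error = std::io::Error;

    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
        // small fields go through the `Read` impl...
        let id = cursor.read_u64::<LittleEndian>()?;
        let len = cursor.read_u64::<LittleEndian>()? as usize;
        // ...while large ones are extracted without copying: the returned
        // SharedData shares the cursor's underlying Arc<Vec<u8>>.
        let body = cursor.extract(len);
        Ok(Payload { id, body })
    }
}

// let payload = Payload::from_bytes(bytes)?; // provided by the trait
```
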
@ -1,4 +1,5 @@
use std::mem;
use crate::is_cjk;
use self::Separator::*;

pub trait TokenizerBuilder {
@ -75,9 +76,9 @@ impl Separator {

fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}

@ -109,9 +110,58 @@ impl<'a> Iterator for Tokenizer<'a> {
return Some(token)
}

distance.replace(distance.map_or(sep, |s| s.add(sep)));
distance = Some(distance.map_or(sep, |s| s.add(sep)));
},
None => {
// if this is a Chinese, a Japanese or a Korean character
// See <http://unicode-table.com>
if is_cjk(c) {
match start_word {
Some(start_word) => {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);

self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};

self.word_index += 1;
self.char_index += word.chars().count();

return Some(token)
},
None => {
let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
let (spaces, word) = prefix.split_at(i);

self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};

if tail.chars().next().and_then(detect_separator).is_none() {
self.word_index += 1;
}
self.char_index += 1;

return Some(token)
}
}
}

if start_word.is_none() { start_word = Some(i) }
},
None => { start_word.get_or_insert(i); },
}
}

@ -150,11 +200,12 @@ mod tests {

#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");

assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);

let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
@ -185,4 +236,24 @@ mod tests {
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}

#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");

assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);

let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");

assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}
102 src/word_area.rs
@ -1,102 +0,0 @@
use std::fmt;

/// Represent a word position in bytes along with the length of it.
///
/// It can represent words byte index to maximum 2^22 and
/// up to words of length 1024.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);

impl WordArea {
/// Construct a `WordArea` from a word position expressed as
/// a number of characters and the length of it.
///
/// # Panics
///
/// The char index must not be greater than 2^22
/// and the length not greater than 1024.
pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(WordAreaError::ByteIndexTooBig)
}

if length & 0b1111_1100_0000_0000 != 0 {
return Err(WordAreaError::LengthTooBig)
}

let char_index = char_index << 10;
Ok(WordArea(char_index | u32::from(length)))
}

pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
match WordArea::new(char_index, length) {
Ok(word_area) => word_area,
Err(WordAreaError::ByteIndexTooBig) => {
panic!("word area byte index must not be greater than 2^22")
},
Err(WordAreaError::LengthTooBig) => {
panic!("word area length must not be greater than 1024")
},
}
}

pub(crate) fn max_value() -> WordArea {
WordArea(u32::max_value())
}

#[inline]
pub fn char_index(self) -> u32 {
self.0 >> 10
}

#[inline]
pub fn length(self) -> u16 {
(self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16
}
}

impl fmt::Debug for WordArea {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("WordArea")
.field("char_index", &self.char_index())
.field("length", &self.length())
.finish()
}
}

pub enum WordAreaError {
ByteIndexTooBig,
LengthTooBig,
}

#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};

quickcheck! {
fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
return TestResult::discard()
}

let word_area = WordArea::new_faillible(gen_char_index, gen_length);

let valid_char_index = word_area.char_index() == gen_char_index;
let valid_length = word_area.length() == gen_length;

TestResult::from_bool(valid_char_index && valid_length)
}

fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
return TestResult::discard()
}

let a = WordArea::new_faillible(gen_char_index, gen_length);
let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);

TestResult::from_bool(a < b)
}
}
}
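
The bit layout this removed type used is easy to check with a worked example; the snippet below is purely illustrative arithmetic, mirroring the `new`, `char_index`, and `length` methods above.

```rust
// Worked example of the 22 + 10 bit packing used by the removed WordArea:
// char_index lives in the high 22 bits, length in the low 10 bits.
fn word_area_packing_demo() {
    let char_index: u32 = 3;
    let length: u16 = 5;

    let packed = (char_index << 10) | u32::from(length);
    assert_eq!(packed, 3 * 1024 + 5); // 3077

    assert_eq!(packed >> 10, char_index); // what char_index() returned
    assert_eq!((packed & 0b11_1111_1111) as u16, length); // what length() returned
}
```
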
9 src/write_to_bytes.rs Normal file
@ -0,0 +1,9 @@
pub trait WriteToBytes {
fn write_to_bytes(&self, bytes: &mut Vec<u8>);

fn into_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
self.write_to_bytes(&mut bytes);
bytes
}
}
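
A sketch of an implementor, pairing with the FromSharedDataCursor trait above; the `Entry` type is hypothetical and the byteorder crate is assumed for the write.

```rust
use byteorder::{LittleEndian, WriteBytesExt};

// Hypothetical type, for illustration only.
struct Entry {
    id: u64,
}

impl WriteToBytes for Entry {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        // Vec<u8> implements io::Write, so WriteBytesExt applies;
        // writing into a Vec cannot fail, hence the unwrap.
        bytes.write_u64::<LittleEndian>(self.id).unwrap();
    }
}

// let bytes = Entry { id: 42 }.into_bytes(); // provided by the trait
```
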
59 typos-ranking-rules.md Normal file
@ -0,0 +1,59 @@
# Typo and Ranking rules

This is an explanation of the default rules used in MeiliDB.

First, we have to define some terms that are used throughout this document.

- A query string is the full list of words that the end user is searching for.
- A query word is one of the words that compose the query string.


## Typo rules

The typo rules are used before sorting the documents. They are used to aggregate them, to choose which documents contain words similar to the queried words.

We use a prefix _Levenshtein_ algorithm to check if the words match. The only difference with a plain Levenshtein algorithm is that it also accepts every word that **starts with a query word**. Therefore a word is accepted if it starts with the query word or has the same length as it.

The Levenshtein distance between two words _M_ and _P_ is the minimum cost of transforming _M_ into _P_ by performing the following elementary operations:

- substitution of a character of _M_ by a different character. (e.g. **k**itten → **s**itten)
- insertion into _M_ of a character from _P_. (e.g. sittin → sittin**g**)
- deletion of a character from _M_. (e.g. satu**r**day → satuday)
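
For readers who want the recurrence spelled out, here is a minimal textbook implementation of that distance. MeiliDB itself does not compute it this way (it compiles each query word into an automaton instead), so this is purely illustrative:

```rust
/// Textbook Levenshtein distance between `m` and `p`:
/// each substitution, insertion, or deletion costs 1.
fn levenshtein(m: &str, p: &str) -> usize {
    let p: Vec<char> = p.chars().collect();
    // prev[j] = distance between the prefix of `m` seen so far
    // and the first `j` characters of `p`
    let mut prev: Vec<usize> = (0..=p.len()).collect();

    for (i, mc) in m.chars().enumerate() {
        let mut current = Vec::with_capacity(p.len() + 1);
        current.push(i + 1);
        for (j, &pc) in p.iter().enumerate() {
            let substitution = prev[j] + (mc != pc) as usize;
            let insertion = current[j] + 1;
            let deletion = prev[j + 1] + 1;
            current.push(substitution.min(insertion).min(deletion));
        }
        prev = current;
    }
    *prev.last().unwrap()
}

// levenshtein("kitten", "sitten") == 1; levenshtein("saturday", "satuday") == 1
```
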


There are some rules about what can be considered "similar". These rules apply **per word**, not to the whole query string:

- If the query word is between 1 and 4 characters long, **no** typo is allowed: only documents containing words that start with, or are exactly equal to, this query word are considered valid for the request.
- If the query word is between 5 and 8 characters long, **one** typo is allowed: documents containing words that match with one typo are retained for the next steps.
- If the query word contains more than 8 characters, we accept a maximum of **two** typos.
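
Expressed as code, this per-word typo budget could look like the sketch below (the function name is ours, for illustration):

```rust
/// Maximum number of typos tolerated for one query word,
/// following the per-word rules listed above.
fn allowed_typos(query_word: &str) -> u8 {
    match query_word.chars().count() {
        0..=4 => 0, // short words must match exactly (or as an exact prefix)
        5..=8 => 1, // medium words tolerate a single typo
        _ => 2,     // longer words tolerate up to two typos
    }
}

// allowed_typos("satuday") == 1, since it is 7 characters long
```
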


This means that "satuday", which is 7 characters long, uses the second rule, and every document containing a word within **one** typo of it will match. For example:

- "satuday" is accepted because it is exactly the same word.
- "sat" is not accepted because the query word is not a prefix of it, but the opposite.
- "satu**r**day" is accepted because it contains **one** typo.
- "s**u**tu**r**day" is not accepted because it contains **two** typos.


## Ranking rules

All documents that have been aggregated using the typo rules above can now be sorted. MeiliDB uses a bucket sort.

What is a bucket sort? We sort all the documents with the first rule; for the documents that it cannot separate, we create a group and sort that group using the second rule, and so on. A sketch of the process follows.
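
Here is a minimal sketch of that bucket sort, assuming a `Criterion` trait shaped like the one visible in the query_builder.rs diff above (an `evaluate` ordering plus an `eq` predicate); the index-range bookkeeping is our illustration, not MeiliDB's exact code:

```rust
use std::cmp::Ordering;
use std::ops::Range;

trait Criterion<T> {
    fn evaluate(&self, a: &T, b: &T) -> Ordering;
    fn eq(&self, a: &T, b: &T) -> bool;
}

fn bucket_sort<T>(documents: &mut [T], criteria: &[Box<dyn Criterion<T>>]) {
    // Each bucket is a range of indices that the rules applied
    // so far cannot tell apart.
    let mut buckets: Vec<Range<usize>> = vec![0..documents.len()];

    for criterion in criteria {
        let mut next_buckets = Vec::new();

        for range in buckets {
            // sort this group with the current rule...
            let group = &mut documents[range.clone()];
            group.sort_unstable_by(|a, b| criterion.evaluate(a, b));

            // ...then split it into runs the rule considers equal;
            // each run will be refined by the next rule.
            let mut start = range.start;
            for i in range.start + 1..=range.end {
                if i == range.end || !criterion.eq(&documents[start], &documents[i]) {
                    next_buckets.push(start..i);
                    start = i;
                }
            }
        }
        buckets = next_buckets;
    }
}
```
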

Here is the list of the default rules, executed in this specific order:

- _Number of Typos_ - The fewer typos there are between the query words and the document words, the better the document.
- _Number of Words_ - A document containing more of the query words is more important than one that contains fewer.
- _Words Proximity_ - The closer the query words are to each other in the document, the better the document.
- _Attribute_ - A document containing the query words in a more important attribute than another document is considered better.
- _Position_ - A document containing the query words at the start of an attribute is considered better than a document that contains them at the end.
- _Exact_ - A document containing the query words in their exact form, not only as a prefix of them, is considered better.