Compare commits

...

18 Commits

Author SHA1 Message Date
daeb226a3f First probably working phrase query doc filtering 2019-12-09 15:30:14 +01:00
2eeae7cfdc Fix the processed distance algorithm 2019-12-08 12:33:59 +01:00
ad958d38e8 squash-me: Bad Typo detection
I have an issue where "speakers" is split into "speaker" and "s":
when I compute the distances for the Typo criterion,
it takes "s" into account and puts a distance of zero in bucket 0
(the "speakers" bucket), therefore it reports any document matching "s"
without typos among the best results.

I need to make sure to ignore "s" when its associated part "speaker"
doesn't exist in the document or is not in the place
it should be ("speaker" immediately followed by "s").

It is hard to imagine that this will add as much computation time to
the Typo criterion as the previous algorithm did, where I computed
the real query/word indexes and removed the invalid ones
before sending the documents to the bucket sort.
2019-12-06 19:18:17 +01:00
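A minimal sketch of the split-word filtering this message describes (the Match type, keep_split_part and its fields are illustrative assumptions, not the crate's actual API): a match produced by the second half of a split word only counts when the first half matched at the immediately preceding word position.

/// Sketch only: split-word filtering for the Typo criterion.
/// The types and field names here are illustrative, not MeiliSearch's.
#[derive(Debug, Clone, Copy)]
struct Match {
    query_index: usize, // index of the (possibly split) query word
    word_index: usize,  // position of the matched word in the document
}

/// Keep a match produced by the second half of a split word ("s") only
/// if the first half ("speaker") matched right before it in the document.
fn keep_split_part(second: Match, firsts: &[Match]) -> bool {
    firsts.iter().any(|first| {
        first.query_index + 1 == second.query_index
            && first.word_index + 1 == second.word_index
    })
}

fn main() {
    // "speakers" is split into "speaker" (query index 0) and "s" (query index 1)
    let speaker = Match { query_index: 0, word_index: 3 };
    let s_far = Match { query_index: 1, word_index: 9 };
    let s_next = Match { query_index: 1, word_index: 4 };

    // "s" far away from "speaker": its zero distance must not fill bucket 0
    assert!(!keep_split_part(s_far, &[speaker]));
    // "speaker" immediately followed by "s": a legitimate "speakers" match
    assert!(keep_split_part(s_next, &[speaker]));
}
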
96c3b98e68 Make the Typo and Words work with synonyms 2019-12-06 13:41:22 +01:00
2a7b34787b Improve the QueryEnhancer by doing a single lookup 2019-12-06 12:10:28 +01:00
60c4292172 squash-me: It seems like we support synonyms, split and concat words 2019-12-05 19:26:10 +01:00
00174d9165 squash-me: Added support for the QueryEnhancer 2019-12-05 17:45:49 +01:00
e7654ffa1e squash-me: Improve the highlighted area 2019-12-05 14:35:38 +01:00
8a17a8d949 squash-me 2019-12-05 12:11:43 +01:00
f21e0bffe2 squash-me 2019-12-04 17:58:02 +01:00
7361dba079 squash-me: Add debug 2019-12-01 23:00:23 +01:00
5bc18fa704 squash-me: Add debug 2019-12-01 20:23:19 +01:00
3c8e4a3884 Make example support stdin using - 2019-11-30 16:53:34 +01:00
6e808e4b8f Add more debug infos 2019-11-30 16:33:48 +01:00
ee3a3cedf3 Before improving fields AttrCount
Removing the fields_count fetching cut the search time in half; we should look at lazily pulling them from the criteria that need them

ugly-test: Make the fields_count fetching lazy

Just before running the exactness criterion
2019-11-30 10:28:29 +01:00
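A rough sketch of the lazy fields_count fetching idea (the RawDocument shape, the fetch closure and the fields_counts accessor are assumptions for illustration, not the crate's real store API): the counts stay unfetched until the exactness criterion first asks for them.

// Sketch only: lazily fetch per-document field counts the first time a
// criterion (e.g. Exact) needs them, instead of for every candidate.
#[derive(Debug, Clone, Copy)]
struct AttrCount { attr: u16, count: u64 }

struct RawDocument {
    id: u64,
    fields_counts: Option<Vec<AttrCount>>, // None until actually requested
}

impl RawDocument {
    fn fields_counts(&mut self, fetch: impl Fn(u64) -> Vec<AttrCount>) -> &[AttrCount] {
        if self.fields_counts.is_none() {
            // pay the store lookup only on first access
            self.fields_counts = Some(fetch(self.id));
        }
        self.fields_counts.as_deref().unwrap()
    }
}

fn main() {
    let mut doc = RawDocument { id: 42, fields_counts: None };
    // pretend this closure reads from the documents_fields_counts store
    let fetch = |_id| vec![AttrCount { attr: 0, count: 2 }];
    assert_eq!(doc.fields_counts(fetch).len(), 1);
}
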
c4320b8b14 Introduce the AttrCount type 2019-11-30 10:21:00 +01:00
2c5da9aa11 squash-me: Improve benchmark naming 2019-11-29 12:28:46 +01:00
eeb01c749c Add some criterion benchmarks to help measure improvements 2019-11-29 12:12:55 +01:00
18 changed files with 1577 additions and 555 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
/target
meilisearch-core/target
**/*.csv
**/*.json_lines
**/*.rs.bk

92
Cargo.lock generated
View File

@ -196,6 +196,14 @@ dependencies = [
"ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cast"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cc"
version = "1.0.47"
@ -249,6 +257,11 @@ dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "compact_arena"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "const-random"
version = "0.1.6"
@ -284,6 +297,39 @@ dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "criterion"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
"cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
"tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "criterion-plot"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-channel"
version = "0.4.0"
@ -760,6 +806,14 @@ dependencies = [
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itertools"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itoa"
version = "0.4.4"
@ -887,6 +941,8 @@ dependencies = [
"bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
"compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"deunicode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -895,6 +951,8 @@ dependencies = [
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"heed 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"meilisearch-schema 0.8.2",
@ -1439,6 +1497,15 @@ dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_os"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_pcg"
version = "0.1.2"
@ -1456,6 +1523,14 @@ dependencies = [
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_xoshiro"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rayon"
version = "1.2.0"
@ -2044,6 +2119,15 @@ dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tinytemplate"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tokio"
version = "0.1.22"
@ -2564,16 +2648,20 @@ dependencies = [
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum bytes 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c"
"checksum c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "214238caa1bf3a496ec3392968969cab8549f96ff30652c9e56885329315f6bb"
"checksum cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
"checksum cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)" = "aa87058dce70a3ff5621797f1506cb837edd02ac4c0ae642b4542dce802908b8"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e8493056968583b0193c1bb04d6f7684586f3726992d6c573261941a895dbd68"
"checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
"checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
"checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
"checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
"checksum criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "938703e165481c8d612ea3479ac8342e5615185db37765162e762ec3523e2fc6"
"checksum criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "eccdc6ce8bbe352ca89025bee672aa6d24f4eb8c53e3a8b5d1bc58011da072a2"
"checksum crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "acec9a3b0b3559f15aee4f90746c4e5e293b701c0f7d3925d24e01645267b68c"
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
"checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac"
@ -2627,6 +2715,7 @@ dependencies = [
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
"checksum jemalloc-sys 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45"
"checksum jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69"
@ -2694,8 +2783,10 @@ dependencies = [
"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
"checksum rand_jitter 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
"checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
"checksum rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a788ae3edb696cfcba1c19bfd388cc4b8c21f8a408432b199c072825084da58a"
"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
"checksum rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0e18c91676f670f6f0312764c759405f13afb98d5d73819840cf72a518487bff"
"checksum rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "83a27732a533a1be0a0035a111fe76db89ad312f6f0347004c220c57f209a123"
"checksum rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "98dcf634205083b17d0861252431eb2acbfb698ab7478a2d20de07954f47ec7b"
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
@ -2760,6 +2851,7 @@ dependencies = [
"checksum tide-querystring 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum tide-slog 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
"checksum tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4574b75faccaacddb9b284faecdf0b544b80b6b294f3d062d325c5726a209c20"
"checksum tokio 0.1.22 (registry+https://github.com/rust-lang/crates.io-index)" = "5a09c0b5bb588872ab2f09afa13ee6e9dac11e10a0ec9e8e3ba39a5a5d530af6"
"checksum tokio-buf 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8fb220f46c53859a4b7ec083e41dec9778ff0b1851c0942b211edb89e0ccdc46"
"checksum tokio-current-thread 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "d16217cad7f1b840c5a97dfb3c43b0c871fef423a6e8d2118c604e843662a443"

View File

@ -9,12 +9,14 @@ arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
chrono = { version = "0.4.9", features = ["serde"] }
compact_arena = "0.4.0"
crossbeam-channel = "0.4.0"
deunicode = "1.0.0"
env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.0"
itertools = "0.8.2" # kill me please
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.2" }
@ -31,10 +33,16 @@ zerocopy = "0.2.8"
[dev-dependencies]
assert_matches = "1.3"
criterion = "0.3"
csv = "1.0.7"
indexmap = { version = "1.2.0", features = ["serde-1"] }
jemallocator = "0.3.2"
rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2"
tempfile = "3.1.0"
termcolor = "1.0.4"
toml = "0.5.3"
[[bench]]
name = "search_benchmark"
harness = false

View File

@ -0,0 +1,95 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
use std::sync::mpsc;
use std::path::Path;
use std::fs;
use std::iter;
use meilisearch_core::Database;
use meilisearch_core::{ProcessedUpdateResult, UpdateStatus};
use serde_json::Value;
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
fn prepare_database(path: &Path) -> Database {
let database = Database::open_or_create(path).unwrap();
let db = &database;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("bench").unwrap();
database.set_update_callback(Box::new(update_fn));
let schema = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/schema.toml");
let string = fs::read_to_string(path).expect("find schema");
toml::from_str(&string).unwrap()
};
let mut update_writer = db.update_write_txn().unwrap();
let _update_id = index.schema_update(&mut update_writer, schema).unwrap();
update_writer.commit().unwrap();
let mut additions = index.documents_addition();
let json: Value = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/movies.json");
let movies_file = fs::File::open(path).expect("find movies");
serde_json::from_reader(movies_file).unwrap()
};
let documents = json.as_array().unwrap();
for document in documents {
additions.update_document(document);
}
let mut update_writer = db.update_write_txn().unwrap();
let update_id = additions.finalize(&mut update_writer).unwrap();
update_writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let update_reader = db.update_read_txn().unwrap();
let result = index.update_status(&update_reader, update_id).unwrap();
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
database
}
pub fn criterion_benchmark(c: &mut Criterion) {
let dir = tempfile::tempdir().unwrap();
let database = prepare_database(dir.path());
let reader = database.main_read_txn().unwrap();
let index = database.open_index("bench").unwrap();
let mut count = 0;
let query = "I love paris ";
let iter = iter::from_fn(|| {
count += 1;
query.get(0..count)
});
let mut group = c.benchmark_group("searching in movies (19654 docs)");
group.sample_size(10);
for query in iter {
let bench_name = BenchmarkId::from_parameter(format!("{:?}", query));
group.bench_with_input(bench_name, &query, |b, query| b.iter(|| {
let builder = index.query_builder();
builder.query(&reader, query, 0..20).unwrap();
}));
}
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@ -1,7 +1,7 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error;
use std::io::Write;
use std::io::{Read, Write};
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
@ -15,19 +15,23 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
use meilisearch_schema::SchemaAttr;
// #[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
struct IndexCommand {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// The csv file to index.
#[structopt(parse(from_os_str))]
csv_data_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// The path to the schema.
#[structopt(long, parse(from_os_str))]
schema: PathBuf,
@ -135,7 +139,13 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
}
}
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
let mut rdr = if command.csv_data_path.as_os_str() == "-" {
csv::Reader::from_reader(Box::new(io::stdin()) as Box<dyn Read>)
} else {
let file = std::fs::File::open(command.csv_data_path)?;
csv::Reader::from_reader(Box::new(file) as Box<dyn Read>)
};
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();

View File

@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -2,21 +2,22 @@ mod dfa;
mod query_enhancer;
use std::cmp::Reverse;
use std::{cmp, vec};
use std::{cmp, fmt, vec};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use log::debug;
use crate::database::MainT;
use crate::error::MResult;
use crate::store;
use self::dfa::{build_dfa, build_prefix_dfa};
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub use self::query_enhancer::QueryEnhancer;
use self::query_enhancer::QueryEnhancerBuilder;
pub use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
pub const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<AutomatonGroup>,
@ -38,6 +39,10 @@ impl AutomatonProducer {
synonyms_store,
)?;
for (i, group) in automatons.iter().enumerate() {
debug!("all automatons: group {} automatons {:?}", i, group.automatons);
}
Ok((AutomatonProducer { automatons }, query_enhancer))
}
@ -68,7 +73,6 @@ impl AutomatonGroup {
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
pub ngram: usize,
@ -78,6 +82,16 @@ pub struct Automaton {
pub query: String,
}
impl fmt::Debug for Automaton {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Automaton")
.field("index", &self.index)
.field("query", &self.query)
.field("is_prefix", &self.is_prefix)
.finish()
}
}
impl Automaton {
pub fn dfa(&self) -> DFA {
if self.is_prefix {
@ -131,7 +145,7 @@ pub fn normalize_str(string: &str) -> String {
string
}
fn split_best_frequency<'a>(
pub fn split_best_frequency<'a>(
reader: &heed::RoTxn<MainT>,
word: &'a str,
postings_lists_store: store::PostingsLists,

View File

@ -58,6 +58,7 @@ where
type Origin = usize;
type RealLength = usize;
#[derive(Debug)]
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
@ -142,67 +143,80 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin
.push((real_range, (range.start, real_length)));
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
let interval_tree = FakeIntervalTree::new(self.real_to_origin);
let mut table = Vec::new();
for real in 0.. {
match replacement(&self.origins, &interval_tree, real) {
Some(range) => table.push(range),
None => break,
}
}
QueryEnhancer { table }
}
}
/// Returns the query indices that represent this real query index.
fn replacement(
origins: &[usize],
real_to_origin: &FakeIntervalTree,
real: u32,
) -> Option<Range<u32>>
{
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = real_to_origin.query(real)?;
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = origins[origin];
let end = origins.get(new_origin + 1)?;
let remaining = (end - start) - n;
Some(Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
})
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = origins[origin];
Some(Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
})
}
}
#[derive(Debug)]
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
table: Vec<Range<u32>>,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
/// Returns the query indices that represent this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = self
.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
}
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
}
}
self.table[real as usize].clone()
}
}

View File

@ -0,0 +1,534 @@
use std::ops::Deref;
use std::fmt;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::io::Write;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use compact_arena::{SmallArena, Idx32, mk_arena};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::{DocIndex, Highlight};
use sdset::Set;
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::{normalize_str, split_best_frequency};
use crate::criterion2::*;
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
pub fn bucket_sort<'c>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
{
// let automatons = construct_automatons(query);
let (automatons, query_enhancer) =
construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
debug!("{:?}", query_enhancer);
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
raw_documents.push(raw_document);
}
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
dbg!(mem::size_of::<BareMatch>());
dbg!(mem::size_of::<SimpleMatch>());
let mut groups = vec![raw_documents.as_mut_slice()];
let criteria = [
Box::new(Typo) as Box<dyn Criterion>,
Box::new(Words) as Box<dyn Criterion>,
Box::new(Proximity),
Box::new(Attribute),
Box::new(WordsPosition),
Box::new(Exact),
Box::new(StableDocId),
];
'criteria: for criterion in &criteria {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last sorted document is after
// the end of the requested range; we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|d| {
let highlights = d.raw_matches.iter().flat_map(|sm| {
let postings_list = &arena[sm.postings_list];
let input = postings_list.input();
let query = &automatons[sm.query_index as usize].query;
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query);
postings_list.iter().map(move |m| {
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
})
}).collect();
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances);
Document {
id: d.raw_matches[0].document_id,
highlights,
#[cfg(test)] matches: Vec::new(),
}
});
Ok(iter.collect())
}
pub struct RawDocument<'a, 'tag> {
pub raw_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
}
impl<'a, 'tag> RawDocument<'a, 'tag> {
fn new<'txn>(
raw_matches: &'a mut [BareMatch<'tag>],
automatons: &[QueryWordAutomaton],
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) -> Option<RawDocument<'a, 'tag>>
{
raw_matches.sort_unstable_by_key(|m| m.query_index);
// debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);
let mut previous_word = None;
for i in 0..raw_matches.len() {
let a = &raw_matches[i];
let auta = &automatons[a.query_index as usize];
match auta.phrase_query {
Some((0, _)) => {
previous_word = Some(a.query_index);
let b = raw_matches.get(i + 1)?;
if a.query_index + 1 != b.query_index {
return None;
}
let pla = &postings_lists[a.postings_list];
let plb = &postings_lists[b.postings_list];
let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
});
if !iter.any(|eb| eb.is_both()) { return None }
},
Some((1, _)) => {
if previous_word.take() != Some(a.query_index - 1) {
return None;
}
},
Some((_, _)) => unreachable!(),
None => (),
}
}
Some(RawDocument {
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
})
}
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
impl fmt::Debug for BareMatch<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BareMatch")
.field("document_id", &self.document_id)
.field("query_index", &self.query_index)
.field("distance", &self.distance)
.field("is_exact", &self.is_exact)
.finish()
}
}
// TODO remove that
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: u16,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Clone)]
pub struct PostingsListView<'txn> {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
}
impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}
impl<'txn> PostingsListView<'txn> {
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView { input, postings_list, offset: 0, len }
}
pub fn len(&self) -> usize {
self.len
}
pub fn input(&self) -> &[u8] {
&self.input
}
pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
assert!(offset + len <= self.len);
PostingsListView {
input: self.input.clone(),
postings_list: self.postings_list.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
fn fetch_matches<'txn, 'tag>(
reader: &'txn heed::RoTxn<MainT>,
automatons: &[QueryWordAutomaton],
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
) -> MResult<Vec<BareMatch<'tag>>>
{
let mut before_words_fst = Instant::now();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
debug!("words fst took {:.02?}", before_words_fst.elapsed());
let mut total_postings_lists = Vec::new();
let mut dfa_time = Duration::default();
let mut stream_next_time = Duration::default();
let mut postings_lists_fetching_time = Duration::default();
for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now();
let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton;
dfa_time += before_dfa.elapsed();
let mut number_of_words = 0;
let mut stream = words.search(&dfa).into_stream();
// while let Some(input) = stream.next() {
loop {
let before_stream_next = Instant::now();
let input = match stream.next() {
Some(input) => input,
None => break,
};
stream_next_time += before_stream_next.elapsed();
number_of_words += 1;
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::new(input, postings_list);
let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
let document_id = group[0].document_id;
let bare_match = BareMatch {
document_id,
query_index: query_index as u16,
distance,
is_exact,
postings_list: posting_list_index,
};
total_postings_lists.push(bare_match);
offset += group.len();
}
}
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
}
debug!("{:?} gives {} words", query, number_of_words);
}
debug!("stream next took {:.02?}", stream_next_time);
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
debug!("dfa creation took {:.02?}", dfa_time);
Ok(total_postings_lists)
}
#[derive(Debug)]
pub struct QueryWordAutomaton {
pub query: String,
/// Is it a word that must be considered exact
/// or is it some derived word (i.e. a synonym)
pub is_exact: bool,
pub is_prefix: bool,
/// If it's a phrase query: what its index is
/// and the length of the phrase
pub phrase_query: Option<(u16, u16)>,
}
impl QueryWordAutomaton {
pub fn exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: false,
phrase_query: None,
}
}
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: true,
phrase_query: None,
}
}
pub fn non_exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: false,
is_prefix: false,
phrase_query: None,
}
}
pub fn dfa(&self) -> DFA {
if self.phrase_query.is_some() {
build_exact_dfa(&self.query)
} else if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
}
fn construct_automatons2(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
QueryWordAutomaton::exact(word)
} else {
QueryWordAutomaton::exact_prefix(word)
};
automaton_index += 1;
automatons.push(automaton);
}
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
QueryWordAutomaton::exact(synonym)
} else {
QueryWordAutomaton::non_exact(synonym)
};
automaton_index += 1;
automatons.push(automaton);
}
}
}
}
if true && n == 1 {
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(left_automaton);
let mut right_automaton = QueryWordAutomaton::exact(right);
right_automaton.phrase_query = Some((1, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1;
automatons.push(right_automaton);
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = QueryWordAutomaton::exact(&normalized);
automaton_index += 1;
automatons.push(automaton);
}
}
}
Ok((automatons, enhancer_builder.build()))
}

View File

@ -1,18 +1,17 @@
use std::cmp::Ordering;
use meilisearch_schema::SchemaAttr;
use sdset::Set;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use crate::{AttrCount, RawDocument};
#[inline]
fn number_exact_matches(
query_index: &[u32],
attribute: &[u16],
is_exact: &[bool],
fields_counts: &Set<(SchemaAttr, u64)>,
fields_counts: &Set<AttrCount>,
) -> usize {
let mut count = 0;
let mut index = 0;
@ -25,8 +24,8 @@ fn number_exact_matches(
if *is_exact {
found_exact = true;
let attr = &attribute[index + pos];
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) {
let AttrCount { count, .. } = fields_counts[pos];
if count == 1 {
return usize::max_value();
}
@ -50,7 +49,7 @@ impl Criterion for Exact {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
let attribute = lhs.attribute();
let fields_counts = &lhs.fields_counts;
let fields_counts = lhs.fields_counts.as_ref().unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
@ -59,7 +58,7 @@ impl Criterion for Exact {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
let attribute = rhs.attribute();
let fields_counts = &rhs.fields_counts;
let fields_counts = rhs.fields_counts.as_ref().unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
@ -86,7 +85,7 @@ mod tests {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
@ -95,7 +94,7 @@ mod tests {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
@ -113,7 +112,7 @@ mod tests {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
@ -122,7 +121,7 @@ mod tests {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};

View File

@ -0,0 +1,588 @@
use std::cmp::{self, Ordering, Reverse};
use std::borrow::Cow;
use std::sync::atomic::{self, AtomicUsize};
use slice_group_by::{GroupBy, GroupByMut};
use compact_arena::SmallArena;
use sdset::{Set, SetBuf};
use log::debug;
use crate::{DocIndex, DocumentId};
use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton};
use crate::automaton::QueryEnhancer;
type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;
pub trait Criterion {
fn name(&self) -> &str;
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
);
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &PostingsListsArena<'tag, 'txn>,
) -> Ordering;
#[inline]
fn eq<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &PostingsListsArena<'tag, 'txn>,
) -> bool
{
self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
}
}
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
postings_lists: &PostingsListsArena<'tag, 'txn>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }
// debug!("{:?}", document.raw_matches[0].document_id);
let mut processed = Vec::new();
let mut raw_matches = document.raw_matches.iter().peekable();
while let Some(m) = raw_matches.next() {
// let automaton = &automatons[m.query_index as usize];
// debug!("{:?} {:?}", m, automaton);
// debug!("{:?}", &postings_lists[m.postings_list]);
// match automaton.phrase_query {
// Some((0, len)) => {
// match raw_matches.peek() {
// Some(BareMatch { query_index, .. }) => {
// if *query_index != m.query_index + 1 {
// raw_matches.next();
// continue
// }
// },
// None => continue,
// }
// },
// Some((_, _)) => continue,
// None => (),
// }
// FIXME we really need to take split words into account;
// they must be seen at the same level as the non-split ones
// if automatons[m.query_index as usize].phrase_query.is_some() {
// continue
// }
let range = query_enhancer.replacement(m.query_index as u32);
let new_len = cmp::max(range.end as usize, processed.len());
processed.resize(new_len, None);
for index in range {
let index = index as usize;
processed[index] = match processed[index] {
Some(distance) if distance > m.distance => Some(m.distance),
Some(distance) => Some(distance),
None => Some(m.distance),
};
}
}
// debug!("{:?}", processed);
document.processed_distances = processed;
}
}
pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
}
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &PostingsListsArena,
) -> Ordering
{
// This function approximates log10(n + 1).
// It is safe to panic on inputs greater than 3,
// as the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn compute_typos(distances: &[Option<u8>]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
for distance in distances {
if let Some(distance) = distance {
sum_typos += custom_log10(*distance);
number_words += 1;
}
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
let lhs = compute_typos(&lhs.processed_distances);
let rhs = compute_typos(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}
pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
}
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &PostingsListsArena,
) -> Ordering
{
#[inline]
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
distances.iter().cloned().filter(Option::is_some).count()
}
let lhs = number_of_query_words(&lhs.processed_distances);
let rhs = number_of_query_words(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}
fn prepare_raw_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
for document in documents {
if !document.processed_matches.is_empty() { continue }
let mut processed = Vec::new();
for m in document.raw_matches.iter() {
let postings_list = &postings_lists[m.postings_list];
processed.reserve(postings_list.len());
for di in postings_list.as_ref() {
let simple_match = SimpleMatch {
query_index: m.query_index,
distance: m.distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: m.is_exact,
};
processed.push(simple_match);
}
}
let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons);
document.processed_matches = processed.into_vec();
}
}
pub struct Proximity;
impl Criterion for Proximity {
fn name(&self) -> &str { "proximity" }
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
}
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &PostingsListsArena<'tag, 'txn>,
) -> Ordering
{
const MAX_DISTANCE: u16 = 8;
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
else { index_proximity(lhs.word_index, rhs.word_index) }
}
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
let mut min_prox = u16::max_value();
for a in lhs {
for b in rhs {
let prox = attribute_proximity(*a, *b);
min_prox = cmp::min(min_prox, prox);
}
}
min_prox
}
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
let mut proximity = 0;
let mut iter = matches.linear_group_by_key(|m| m.query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
let lhs = matches_proximity(&lhs.processed_matches);
let rhs = matches_proximity(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}
pub struct Attribute;
impl Criterion for Attribute {
fn name(&self) -> &str { "attribute" }
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
}
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &PostingsListsArena<'tag, 'txn>,
) -> Ordering
{
#[inline]
fn best_attribute(matches: &[SimpleMatch]) -> u16 {
let mut best_attribute = u16::max_value();
for group in matches.linear_group_by_key(|bm| bm.query_index) {
best_attribute = cmp::min(best_attribute, group[0].attribute);
}
best_attribute
}
let lhs = best_attribute(&lhs.processed_matches);
let rhs = best_attribute(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}
pub struct WordsPosition;
impl Criterion for WordsPosition {
fn name(&self) -> &str { "words position" }
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
}
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &PostingsListsArena<'tag, 'txn>,
) -> Ordering
{
#[inline]
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
let mut sum_words_position = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_words_position += group[0].word_index as usize;
}
sum_words_position
}
let lhs = sum_words_position(&lhs.processed_matches);
let rhs = sum_words_position(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}
pub struct Exact;
impl Criterion for Exact {
fn name(&self) -> &str { "exact" }
fn prepare(
&self,
documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
for document in documents {
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
}
}
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &PostingsListsArena,
) -> Ordering
{
#[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_exact_query_words += group[0].is_exact as usize;
}
sum_exact_query_words
}
let lhs = sum_exact_query_words(&lhs.raw_matches);
let rhs = sum_exact_query_words(&rhs.raw_matches);
lhs.cmp(&rhs).reverse()
}
}
pub struct StableDocId;
impl Criterion for StableDocId {
fn name(&self) -> &str { "stable document id" }
fn prepare(
&self,
documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
// ...
}
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &PostingsListsArena,
) -> Ordering
{
let lhs = &lhs.raw_matches[0].document_id;
let rhs = &rhs.raw_matches[0].document_id;
lhs.cmp(rhs)
}
}
pub fn multiword_rewrite_matches(
simple_matches: &mut [SimpleMatch],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch>
{
let mut matches = Vec::with_capacity(simple_matches.len());
// let before_sort = Instant::now();
// we sort the matches by word index to make them rewritable
simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index));
// debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) {
let iter = same_attribute.linear_group_by_key(|m| m.query_index);
let mut iter = iter.peekable();
while let Some(same_query_index) = iter.next() {
let query_index = same_query_index[0].query_index;
// TODO we need to support phrase query of longer length
if let Some((i, len)) = automatons[query_index as usize].phrase_query {
if i != 0 { continue }
// is the next query_index group the required one
if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) {
if let Some(next) = iter.next() {
for ma in same_query_index {
for mb in next {
if ma.word_index == mb.word_index + 1 {
matches.push(*ma);
matches.push(*mb);
}
}
}
}
}
} else {
matches.extend_from_slice(same_query_index);
}
}
}
// let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some();
// let next_query_index = match_.query_index + 1;
// if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() {
// continue
// }
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
let query_index = rep.next().unwrap() as u16;
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups, stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}
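// A deliberately tiny sketch of the padding idea implemented above; it is not
// part of this diff. `MiniMatch` and `pad_word_indices` are hypothetical
// stand-ins for `SimpleMatch` and the loop above: when one real query word is
// replaced by an n-word synonym, every following word index in the same
// attribute must be shifted by n - 1 so that proximity criteria still see the
// words as contiguous.
#[derive(Debug, Clone, Copy, PartialEq)]
struct MiniMatch { query_index: u16, word_index: u16 }

fn pad_word_indices(matches: &[(MiniMatch, u16)]) -> Vec<MiniMatch> {
    // each match carries the length of the replacement it originates from
    let mut padding = 0;
    let mut padded = Vec::with_capacity(matches.len());
    for (m, replacement_len) in matches {
        padded.push(MiniMatch { word_index: m.word_index + padding, ..*m });
        padding += replacement_len.saturating_sub(1);
    }
    padded
}

#[test]
fn padding_makes_room_for_multiword_replacements() {
    // "NYC subway" where "NYC" expands to "new york city" (length 3):
    // "subway" must move from word index 1 to word index 3.
    let matches = [(MiniMatch { query_index: 0, word_index: 0 }, 3),
                   (MiniMatch { query_index: 1, word_index: 1 }, 1)];
    let padded = pad_word_indices(&matches);
    assert_eq!(padded[1], MiniMatch { query_index: 1, word_index: 3 });
}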

View File

@ -18,6 +18,10 @@ pub mod serde;
pub mod store;
mod update;
// TODO replace
mod bucket_sort;
mod criterion2;
pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, MResult};
pub use self::number::{Number, ParseNumberError};
@ -25,7 +29,7 @@ pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]

View File

@ -6,15 +6,16 @@ use std::time::{Duration, Instant};
use std::{cmp, mem};
use fst::{IntoStreamer, Streamer};
use log::debug;
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use crate::database::MainT;
use crate::{bucket_sort::bucket_sort, database::MainT};
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> {
@ -35,9 +36,12 @@ fn multiword_rewrite_matches(
) -> SetBuf<(DocumentId, TmpMatch)> {
let mut padded_matches = Vec::with_capacity(matches.len());
let before_sort = Instant::now();
// we sort the matches by word index to make them rewritable
matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
// padding will only be applied
@ -136,6 +140,12 @@ fn multiword_rewrite_matches(
document_matches.sort_unstable();
}
debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::new_unchecked(padded_matches)
}
@ -146,52 +156,64 @@ fn fetch_raw_documents(
searchables: Option<&ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
) -> MResult<Vec<RawDocument>> {
let mut matches = Vec::new();
let mut highlights = Vec::new();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let before_automatons_groups_loop = Instant::now();
let mut doc_indexes_rewrite = Duration::default();
let mut retrieve_postings_lists = Duration::default();
let mut stream_reserve = Duration::default();
let mut covered_area_time = Duration::default();
let mut eval_time = Duration::default();
for group in automatons_groups {
let AutomatonGroup {
is_phrase_query,
automatons,
} = group;
let AutomatonGroup { is_phrase_query, automatons } = group;
let phrase_query_len = automatons.len();
let mut tmp_matches = Vec::new();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton {
index,
is_exact,
query_len,
query,
..
} = automaton;
let Automaton { index, is_exact, query_len, query, .. } = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let before_stream_loop = Instant::now();
let mut stream_count = 0;
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let before_eval_time = Instant::now();
let distance = dfa.eval(input).to_u8();
eval_time += before_eval_time.elapsed();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
stream_count += 1;
let before_covered_area = Instant::now();
let covered_area = if *query_len > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
covered_area_time += before_covered_area.elapsed();
let before_retrieve_postings_lists = Instant::now();
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
retrieve_postings_lists += before_retrieve_postings_lists.elapsed();
let before_stream_reserve = Instant::now();
tmp_matches.reserve(doc_indexes.len());
stream_reserve += before_stream_reserve.elapsed();
let before_doc_indexes_rewrite = Instant::now();
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
@ -215,7 +237,9 @@ fn fetch_raw_documents(
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
}
debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
}
if *is_phrase_query {
@ -244,32 +268,52 @@ fn fetch_raw_documents(
}
}
} else {
let before_rerewrite = Instant::now();
matches.reserve(tmp_matches.len());
highlights.reserve(tmp_matches.len());
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
}
}
debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
debug!("stream reserve took {:.02?}", stream_reserve);
debug!("covered area took {:.02?}", covered_area_time);
debug!("eval value took {:.02?}", eval_time);
// {
// let mut cloned = matches.clone();
// let before_sort_test = Instant::now();
// cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
// debug!("sorting test took {:.02?}", before_sort_test.elapsed());
// }
let before_multiword_rewrite_matches = Instant::now();
debug!("number of matches before rewrite {}", matches.len());
debug!("{:?}", query_enhancer);
let matches = multiword_rewrite_matches(matches, &query_enhancer);
debug!("number of matches after rewrite {}", matches.len());
debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
let before_highlight_sorting = Instant::now();
let highlights = {
highlights.sort_unstable_by_key(|(id, _)| *id);
SetBuf::new_unchecked(highlights)
};
debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
let fields_counts = {
let mut fields_counts = Vec::new();
for group in matches.linear_group_by_key(|(id, ..)| *id) {
let id = group[0].0;
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
let (attr, count) = result?;
fields_counts.push((id, attr, count));
}
}
SetBuf::new(fields_counts).unwrap()
};
let before_raw_documents = Instant::now();
let raw_documents = raw_documents_from(matches, highlights);
debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
debug!("documents to worry about: {}", raw_documents.len());
Ok(raw_documents_from(matches, highlights, fields_counts))
Ok(raw_documents)
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@ -307,9 +351,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
synonyms_store: synonyms,
}
}
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn with_filter<F>(&mut self, function: F)
where
F: Fn(DocumentId) -> bool + 'f,
@ -342,29 +384,12 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
range: Range<usize>,
) -> MResult<Vec<Document>> {
match self.distinct {
Some((distinct, distinct_size)) => raw_query_with_distinct(
Some((distinct, distinct_size)) => unimplemented!("distinct"),
None => bucket_sort(
reader,
query,
range,
self.filter,
distinct,
distinct_size,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
),
None => raw_query(
reader,
query,
range,
self.filter,
self.timeout,
self.criteria,
self.searchable_attrs,
// self.criteria,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
@ -434,6 +459,11 @@ where
for auts in automaton_producer {
automatons.push(auts);
for (i, group) in automatons.iter().enumerate() {
debug!("group {} automatons {:?}", i, group.automatons);
}
let before_fetch_raw_documents = Instant::now();
// we must retrieve the documents associated
// with the current automatons
let mut raw_documents = fetch_raw_documents(
@ -443,8 +473,8 @@ where
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
documents_fields_counts_store,
)?;
debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
// stop processing when time is running out
if let Some(timeout) = timeout {
@ -453,6 +483,8 @@ where
}
}
let before_bucket_sort = Instant::now();
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
@ -468,12 +500,30 @@ where
continue;
}
// we must pull the fields counts of these documents
// TODO it would be great to have a "dependency" mechanism for each criterion
// and make it so that we can lazily pull/compute only the data it needs
if criterion.name() == "Exact" {
for document in group.iter_mut() {
let mut fields_counts = Vec::new();
for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
let (attr, count) = result?;
fields_counts.push(AttrCount { attr: attr.0, count });
}
document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
}
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
debug!("criterion {} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last document sorted is after
// the end of the requested range; we can continue to the next criterion
if documents_seen >= range.end {
@ -483,6 +533,8 @@ where
}
}
debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
// once we have classified the documents related to the current
// automatons, we save that as the next valid result
let iter = raw_documents
@ -561,7 +613,6 @@ where
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
documents_fields_counts_store,
)?;
// stop processing when time is running out
@ -815,7 +866,7 @@ mod tests {
let mut words_fst = BTreeSet::new();
let mut postings_lists = HashMap::new();
let mut fields_counts = HashMap::<_, u64>::new();
let mut fields_counts = HashMap::<_, u16>::new();
for (word, indexes) in iter {
let word = word.to_lowercase().into_bytes();

View File

@ -1,398 +0,0 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Returns `true` if the specified range can accept the given replacement words.
/// Returns `false` if the replacement words are already present in the original query
/// or if there are fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there are no more replacement words
// than there already are in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
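// A minimal sanity-check sketch of the three cases documented above; this test
// did not exist in the original file and only relies on the behaviour described
// in the doc comment.
#[test]
fn rewrite_range_with_examples() {
    let query = ["new", "york", "city", "subway"];
    // ignored because the replacement is already present in the original query
    assert!(!rewrite_range_with(&query, 0..2, &["new", "york", "city"]));
    // ignored because the replacement is not bigger than the replaced range
    assert!(!rewrite_range_with(&query, 0..2, &["new", "york"]));
    // accepted because the replacement is bigger than the replaced range
    assert!(rewrite_range_with(&["NYC", "subway"], 0..1, &["new", "york", "city"]));
}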
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
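// A small sanity-check sketch of the lookup above; this test did not exist in
// the original file and the interval values are made up for illustration.
#[test]
fn fake_interval_tree_query_examples() {
    let tree = FakeIntervalTree::new(vec![(0..2, (0, 1)), (2..5, (2, 3))]);
    // a point inside an interval returns that interval and its value
    assert_eq!(tree.query(3), Some((2..5, (2, 3))));
    // a point outside every interval returns None
    assert_eq!(tree.query(7), None);
}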
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origin query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Updates the final real-to-origin query index mapping.
///
/// `range` is the range of original words that these `replacement` words replace,
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the relation between real and origin numbers;
// this way it will be possible to know by how much
// we need to pad the real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -1,18 +1,17 @@
use std::fmt;
use std::sync::Arc;
use meilisearch_schema::SchemaAttr;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{DocumentId, Highlight, TmpMatch};
use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
pub fields_counts: Option<SetBuf<AttrCount>>,
}
impl RawDocument {
@ -100,27 +99,33 @@ impl fmt::Debug for RawDocument {
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
highlights: SetBuf<(DocumentId, Highlight)>
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
for (mgroup, hgroup) in matches.zip(highlights) {
assert_eq!(mgroup[0].0, hgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
let fields_counts = None;
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
// TODO we could try to keep both representations:
// - the data-oriented one and
// - the raw one, the one that comes from the arguments of this function
// This way we would be able to produce the data-oriented form lazily.
//
// For example the default first criterion is `SumOfTypos`
// and only needs the `query_index` and the `distance` fields.
// It would probably be good to avoid wasting time sorting other fields of documents
// that will never reach the second criterion.
matches2.extend_from_slice(mgroup);
}
@ -128,16 +133,8 @@ pub fn raw_documents_from(
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches {
range,
matches: matches.clone(),
};
RawDocument {
id,
matches,
highlights,
fields_counts,
}
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument { id, matches, highlights, fields_counts }
})
.collect()
}

View File

@ -325,7 +325,7 @@ where
txn,
document_id,
attribute,
number_of_words as u64,
number_of_words as u16,
)?;
}
}

View File

@ -7,7 +7,7 @@ use meilisearch_schema::SchemaAttr;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl DocumentsFieldsCounts {
@ -16,7 +16,7 @@ impl DocumentsFieldsCounts {
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
value: u16,
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value)
@ -42,7 +42,7 @@ impl DocumentsFieldsCounts {
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>> {
) -> ZResult<Option<u16>> {
let key = DocumentAttrKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)),
@ -79,11 +79,11 @@ impl DocumentsFieldsCounts {
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
type Item = ZResult<(SchemaAttr, u64)>;
type Item = ZResult<(SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
@ -99,7 +99,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for DocumentsIdsIter<'_> {
@ -123,11 +123,11 @@ impl Iterator for DocumentsIdsIter<'_> {
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
type Item = ZResult<(DocumentId, SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {

View File

@ -63,3 +63,11 @@ pub struct Highlight {
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(C)]
pub struct AttrCount {
pub attr: u16,
pub count: u16,
}
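// A minimal usage sketch, not part of this diff: collecting per-attribute word
// counts into a sorted `sdset::SetBuf`, the way the Exact criterion pulls them
// lazily in the query builder above. It assumes `AttrCount` and the `sdset`
// crate are in scope; the values are made up for illustration.
#[test]
fn attr_count_set_buf_example() {
    use sdset::SetBuf;

    let mut fields_counts = vec![
        AttrCount { attr: 1, count: 12 },
        AttrCount { attr: 0, count: 3 },
    ];
    // `SetBuf::new` expects a sorted, deduplicated vector;
    // `AttrCount` derives `Ord`, so sorting the whole struct is enough here
    fields_counts.sort_unstable();
    let fields_counts = SetBuf::new(fields_counts).unwrap();
    assert_eq!(fields_counts.len(), 2);
}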