Compare commits

...

26 Commits

Author SHA1 Message Date
a8c3438887 Introduce ContextMut and Context structs 2019-12-12 11:33:39 +01:00
03e06a80b2 Prefer summing the attribute 2019-12-11 18:37:26 +01:00
a303b6552a Introduce bucket_sort_with_distinct function 2019-12-11 18:19:54 +01:00
96a6f513ab Update the criteria to the new ones 2019-12-11 17:02:10 +01:00
4b1127cd98 Remove the raw_query functions 2019-12-11 15:34:30 +01:00
8db102b8ed Bump the sdset dependency to 0.3.6 2019-12-11 12:08:30 +01:00
746ea3db67 Rewrite the phrase query postings lists
This simplified the multiword_rewrite_matches function a little bit.
2019-12-10 13:38:28 +01:00
9f2d430e96 Debug pre filtered number of documents 2019-12-09 16:45:06 +01:00
daeb226a3f First probably working phrase query doc filtering 2019-12-09 15:30:14 +01:00
2eeae7cfdc Fix the processed distance algorithm 2019-12-08 12:33:59 +01:00
ad958d38e8 squash-me: Bad Typo detection
I have an issue where "speakers" is split into "speaker" and "s",
when I compute the distances for the Typo criterion,
it takes "s" into account and put a distance of zero in the bucket 0
(the "speakers" bucket), therefore it reports any document matching "s"
without typos as best results.

I need to make sure to ignore "s" when its associated part "speaker"
doesn't even exist in the document and is not in the place
it should be ("speaker" followed by "s").

This is hard to think that it will had much computation time to
the Typo criterion like in the previous algorithm where I computed
the real query/words indexes based and removed the invalid ones
before sending the documents to the bucket sort.
2019-12-06 19:18:17 +01:00
96c3b98e68 Make the Typo and Words work with synonyms 2019-12-06 13:41:22 +01:00
2a7b34787b Improve the QueryEnhancer by doing a single lookup 2019-12-06 12:10:28 +01:00
60c4292172 squash-me: It seems like we support synonyms, split and concat words 2019-12-05 19:26:10 +01:00
00174d9165 squash-me: Added support for the QueryEnhancer 2019-12-05 17:45:49 +01:00
e7654ffa1e squash-me: Improve the highlighted area 2019-12-05 14:35:38 +01:00
8a17a8d949 squash-me 2019-12-05 12:11:43 +01:00
f21e0bffe2 squash-me 2019-12-04 17:58:02 +01:00
7361dba079 squash-me: Add debug 2019-12-01 23:00:23 +01:00
5bc18fa704 squash-me: Add debug 2019-12-01 20:23:19 +01:00
3c8e4a3884 Make example support stdin using - 2019-11-30 16:53:34 +01:00
6e808e4b8f Add more debug infos 2019-11-30 16:33:48 +01:00
ee3a3cedf3 Before improving fields AttrCount
Removing the fields_count fetching halved the search time; we should look at lazily pulling them from the criteria that need them (see the second sketch after the commit list below).

ugly-test: Make the fields_count fetching lazy

Just before running the exactness criterion
2019-11-30 10:28:29 +01:00
c4320b8b14 Introduce the AttrCount type 2019-11-30 10:21:00 +01:00
2c5da9aa11 squash-me: Improve benchmarks naming 2019-11-29 12:28:46 +01:00
eeb01c749c Add some criterion benchmarks to help measure improvements 2019-11-29 12:12:55 +01:00
31 changed files with 1634 additions and 2177 deletions
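
A minimal sketch of the split-word fix described in the "Bad Typo detection" commit above: a match for the right half of a split word (the "s" of "speakers") only counts when the left half ("speaker") matches the word just before it, in the same attribute. The helper name and the left_qi/right_qi parameters are illustrative assumptions, not code from this changeset.

use crate::bucket_sort::SimpleMatch;

// Hypothetical helper: drop matches of the right half of a split word
// unless the left half matches the preceding word in the same attribute.
// `left_qi` and `right_qi` are the query indexes assigned to the two halves.
fn filter_split_word_matches(
    matches: &[SimpleMatch],
    left_qi: u16,
    right_qi: u16,
) -> Vec<SimpleMatch> {
    matches
        .iter()
        .copied()
        .filter(|m| {
            if m.query_index != right_qi {
                return true;
            }
            // keep "s" only when "speaker" is found right before it
            matches.iter().any(|l| {
                l.query_index == left_qi
                    && l.attribute == m.attribute
                    && l.word_index + 1 == m.word_index
            })
        })
        .collect()
}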
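
And a sketch of the lazy fields_count fetching mentioned in the "Before improving fields AttrCount" commit: counts are pulled from the store only the first time a criterion (typically the exactness one) asks for them, then cached. The LazyFieldsCounts type and its fetch closure are assumptions for illustration, not code from this changeset.

use hashbrown::HashMap;
use meilisearch_schema::SchemaAttr;
use sdset::SetBuf;

use crate::DocumentId;

// Hypothetical cache: fields counts are fetched on first use and reused afterwards.
struct LazyFieldsCounts<F> {
    fetch: F,
    cache: HashMap<DocumentId, Option<SetBuf<(SchemaAttr, u64)>>>,
}

impl<F> LazyFieldsCounts<F>
where
    F: Fn(DocumentId) -> Option<SetBuf<(SchemaAttr, u64)>>,
{
    fn new(fetch: F) -> LazyFieldsCounts<F> {
        LazyFieldsCounts { fetch, cache: HashMap::new() }
    }

    // only the documents that reach the exactness criterion pay the fetching cost
    fn get(&mut self, id: DocumentId) -> Option<&SetBuf<(SchemaAttr, u64)>> {
        let fetch = &self.fetch;
        self.cache.entry(id).or_insert_with(|| fetch(id)).as_ref()
    }
}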

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
/target
meilisearch-core/target
**/*.csv
**/*.json_lines
**/*.rs.bk

98
Cargo.lock generated
View File

@ -196,6 +196,14 @@ dependencies = [
"ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cast"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cc"
version = "1.0.47"
@ -249,6 +257,11 @@ dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "compact_arena"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "const-random"
version = "0.1.6"
@ -284,6 +297,39 @@ dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "criterion"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
"cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
"tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "criterion-plot"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-channel"
version = "0.4.0"
@ -760,6 +806,14 @@ dependencies = [
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itertools"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "itoa"
version = "0.4.4"
@ -887,6 +941,8 @@ dependencies = [
"bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
"compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"deunicode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -895,6 +951,8 @@ dependencies = [
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"heed 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"meilisearch-schema 0.8.2",
@ -903,7 +961,7 @@ dependencies = [
"once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
"siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1439,6 +1497,15 @@ dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_os"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_pcg"
version = "0.1.2"
@ -1456,6 +1523,14 @@ dependencies = [
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_xoshiro"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rayon"
version = "1.2.0"
@ -1616,7 +1691,7 @@ dependencies = [
[[package]]
name = "sdset"
version = "0.3.3"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -2044,6 +2119,15 @@ dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tinytemplate"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tokio"
version = "0.1.22"
@ -2564,16 +2648,20 @@ dependencies = [
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum bytes 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c"
"checksum c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "214238caa1bf3a496ec3392968969cab8549f96ff30652c9e56885329315f6bb"
"checksum cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
"checksum cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)" = "aa87058dce70a3ff5621797f1506cb837edd02ac4c0ae642b4542dce802908b8"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e8493056968583b0193c1bb04d6f7684586f3726992d6c573261941a895dbd68"
"checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
"checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
"checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
"checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
"checksum criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "938703e165481c8d612ea3479ac8342e5615185db37765162e762ec3523e2fc6"
"checksum criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "eccdc6ce8bbe352ca89025bee672aa6d24f4eb8c53e3a8b5d1bc58011da072a2"
"checksum crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "acec9a3b0b3559f15aee4f90746c4e5e293b701c0f7d3925d24e01645267b68c"
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
"checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac"
@ -2627,6 +2715,7 @@ dependencies = [
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
"checksum jemalloc-sys 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45"
"checksum jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69"
@ -2694,8 +2783,10 @@ dependencies = [
"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
"checksum rand_jitter 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
"checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
"checksum rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a788ae3edb696cfcba1c19bfd388cc4b8c21f8a408432b199c072825084da58a"
"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
"checksum rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0e18c91676f670f6f0312764c759405f13afb98d5d73819840cf72a518487bff"
"checksum rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "83a27732a533a1be0a0035a111fe76db89ad312f6f0347004c220c57f209a123"
"checksum rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "98dcf634205083b17d0861252431eb2acbfb698ab7478a2d20de07954f47ec7b"
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
@ -2715,7 +2806,7 @@ dependencies = [
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
"checksum sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b6d2447743d6c37b6d67af88d9c0f1fc92989e2d9745d9b2f3d305b906a90195"
"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0"
@ -2760,6 +2851,7 @@ dependencies = [
"checksum tide-querystring 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum tide-slog 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
"checksum tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4574b75faccaacddb9b284faecdf0b544b80b6b294f3d062d325c5726a209c20"
"checksum tokio 0.1.22 (registry+https://github.com/rust-lang/crates.io-index)" = "5a09c0b5bb588872ab2f09afa13ee6e9dac11e10a0ec9e8e3ba39a5a5d530af6"
"checksum tokio-buf 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8fb220f46c53859a4b7ec083e41dec9778ff0b1851c0942b211edb89e0ccdc46"
"checksum tokio-current-thread 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "d16217cad7f1b840c5a97dfb3c43b0c871fef423a6e8d2118c604e843662a443"

View File

@ -9,12 +9,14 @@ arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
chrono = { version = "0.4.9", features = ["serde"] }
compact_arena = "0.4.0"
crossbeam-channel = "0.4.0"
deunicode = "1.0.0"
env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.0"
itertools = "0.8.2" # kill me please
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.2" }
@ -22,7 +24,7 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.2" }
meilisearch-types = { path = "../meilisearch-types", version = "0.8.2" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3"
sdset = "0.3.6"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = "1.0.41"
siphasher = "0.3.1"
@ -31,10 +33,16 @@ zerocopy = "0.2.8"
[dev-dependencies]
assert_matches = "1.3"
criterion = "0.3"
csv = "1.0.7"
indexmap = { version = "1.2.0", features = ["serde-1"] }
jemallocator = "0.3.2"
rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2"
tempfile = "3.1.0"
termcolor = "1.0.4"
toml = "0.5.3"
[[bench]]
name = "search_benchmark"
harness = false

View File

@ -0,0 +1,95 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
use std::sync::mpsc;
use std::path::Path;
use std::fs;
use std::iter;
use meilisearch_core::Database;
use meilisearch_core::{ProcessedUpdateResult, UpdateStatus};
use serde_json::Value;
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
fn prepare_database(path: &Path) -> Database {
let database = Database::open_or_create(path).unwrap();
let db = &database;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("bench").unwrap();
database.set_update_callback(Box::new(update_fn));
let schema = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/schema.toml");
let string = fs::read_to_string(path).expect("find schema");
toml::from_str(&string).unwrap()
};
let mut update_writer = db.update_write_txn().unwrap();
let _update_id = index.schema_update(&mut update_writer, schema).unwrap();
update_writer.commit().unwrap();
let mut additions = index.documents_addition();
let json: Value = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/movies.json");
let movies_file = fs::File::open(path).expect("find movies");
serde_json::from_reader(movies_file).unwrap()
};
let documents = json.as_array().unwrap();
for document in documents {
additions.update_document(document);
}
let mut update_writer = db.update_write_txn().unwrap();
let update_id = additions.finalize(&mut update_writer).unwrap();
update_writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let update_reader = db.update_read_txn().unwrap();
let result = index.update_status(&update_reader, update_id).unwrap();
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
database
}
pub fn criterion_benchmark(c: &mut Criterion) {
let dir = tempfile::tempdir().unwrap();
let database = prepare_database(dir.path());
let reader = database.main_read_txn().unwrap();
let index = database.open_index("bench").unwrap();
let mut count = 0;
let query = "I love paris ";
let iter = iter::from_fn(|| {
count += 1;
query.get(0..count)
});
let mut group = c.benchmark_group("searching in movies (19654 docs)");
group.sample_size(10);
for query in iter {
let bench_name = BenchmarkId::from_parameter(format!("{:?}", query));
group.bench_with_input(bench_name, &query, |b, query| b.iter(|| {
let builder = index.query_builder();
builder.query(&reader, query, 0..20).unwrap();
}));
}
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@ -1,7 +1,7 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error;
use std::io::Write;
use std::io::{self, Read, Write};
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
@ -15,19 +15,23 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
use meilisearch_schema::SchemaAttr;
// #[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
struct IndexCommand {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// The csv file to index.
#[structopt(parse(from_os_str))]
csv_data_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// The path to the schema.
#[structopt(long, parse(from_os_str))]
schema: PathBuf,
@ -135,7 +139,13 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
}
}
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
let mut rdr = if command.csv_data_path.as_os_str() == "-" {
csv::Reader::from_reader(Box::new(io::stdin()) as Box<dyn Read>)
} else {
let file = std::fs::File::open(command.csv_data_path)?;
csv::Reader::from_reader(Box::new(file) as Box<dyn Read>)
};
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();

View File

@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -1,125 +1,13 @@
mod dfa;
mod query_enhancer;
use std::cmp::Reverse;
use std::{cmp, vec};
use meilisearch_tokenizer::is_cjk;
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use crate::database::MainT;
use crate::error::MResult;
use crate::store;
use self::dfa::{build_dfa, build_prefix_dfa};
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub use self::query_enhancer::QueryEnhancer;
use self::query_enhancer::QueryEnhancerBuilder;
pub use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<AutomatonGroup>,
}
impl AutomatonProducer {
pub fn new(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_list_store: store::PostingsLists,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) = generate_automatons(
reader,
query,
main_store,
postings_list_store,
synonyms_store,
)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
self.automatons.into_iter()
}
}
#[derive(Debug)]
pub struct AutomatonGroup {
pub is_phrase_query: bool,
pub automatons: Vec<Automaton>,
}
impl AutomatonGroup {
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: false,
automatons,
}
}
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
AutomatonGroup {
is_phrase_query: true,
automatons,
}
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
pub ngram: usize,
pub query_len: usize,
pub is_exact: bool,
pub is_prefix: bool,
pub query: String,
}
impl Automaton {
pub fn dfa(&self) -> DFA {
if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: false,
query: query.to_string(),
}
}
fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: true,
query: query.to_string(),
}
}
fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: false,
is_prefix: false,
query: query.to_string(),
}
}
}
pub const NGRAMS: usize = 3;
pub fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase();
@ -130,167 +18,3 @@ pub fn normalize_str(string: &str) -> String {
string
}
fn split_best_frequency<'a>(
reader: &heed::RoTxn<MainT>,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn generate_automatons(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_automatons = Vec::new();
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
Automaton::exact(automaton_index, 1, word)
} else {
Automaton::prefix_exact(automaton_index, 1, word)
};
automaton_index += 1;
original_automatons.push(automaton);
}
automatons.push(AutomatonGroup::normal(original_automatons));
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed,
// i.e. "new " does not, but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(
query_range.clone(),
real_query_index,
&synonyms_words,
);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
Automaton::exact(automaton_index, n, synonym)
} else {
Automaton::non_exact(automaton_index, n, synonym)
};
automaton_index += 1;
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
}
if n == 1 {
if let Some((left, right)) =
split_best_frequency(reader, &normalized, postings_lists_store)?
{
let a = Automaton::exact(automaton_index, 1, left);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
let b = Automaton::exact(automaton_index, 1, right);
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1;
automatons.push(AutomatonGroup::normal(vec![automaton]));
}
}
}
// order automatons, the most important first,
// we keep the original automatons at the front.
automatons[1..].sort_by_key(|group| {
let a = group.automatons.first().unwrap();
(
Reverse(a.is_exact),
a.ngram,
Reverse(group.automatons.len()),
)
});
Ok((automatons, enhancer_builder.build()))
}

View File

@ -58,6 +58,7 @@ where
type Origin = usize;
type RealLength = usize;
#[derive(Debug)]
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
@ -142,67 +143,80 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin
.push((real_range, (range.start, real_length)));
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
let interval_tree = FakeIntervalTree::new(self.real_to_origin);
let mut table = Vec::new();
for real in 0.. {
match replacement(&self.origins, &interval_tree, real) {
Some(range) => table.push(range),
None => break,
}
}
QueryEnhancer { table }
}
}
/// Returns the query indices that represent this real query index.
fn replacement(
origins: &[usize],
real_to_origin: &FakeIntervalTree,
real: u32,
) -> Option<Range<u32>>
{
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = real_to_origin.query(real)?;
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = origins[origin];
let end = origins.get(new_origin + 1)?;
let remaining = (end - start) - n;
Some(Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
})
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = origins[origin];
Some(Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
})
}
}
#[derive(Debug)]
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
table: Vec<Range<u32>>,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
/// Returns the query indices that represent this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = self
.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
}
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
}
}
self.table[real as usize].clone()
}
}

View File

@ -0,0 +1,711 @@
use std::ops::Deref;
use std::{cmp, fmt};
use std::borrow::Cow;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use compact_arena::{SmallArena, Idx32, mk_arena};
use fst::{IntoStreamer, Streamer};
use hashbrown::HashMap;
use levenshtein_automata::DFA;
use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::DocIndex;
use sdset::{Set, SetBuf};
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS;
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::normalize_str;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::criterion::{Criteria, Context, ContextMut};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
pub fn bucket_sort<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
criteria: Criteria<'c>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return bucket_sort_with_distinct(
reader,
query,
range,
filter,
distinct,
distinct_size,
criteria,
main_store,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
);
}
let (mut automatons, mut query_enhancer) =
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
debug!("{:?}", query_enhancer);
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches =
fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut prefiltered_documents = 0;
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
prefiltered_documents += 1;
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
raw_documents.push(raw_document);
}
}
debug!("creating {} (original {}) candidates documents took {:.02?}",
raw_documents.len(),
prefiltered_documents,
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
let ctx = ContextMut {
postings_lists: &mut arena,
query_enhancer: &mut query_enhancer,
automatons: &mut automatons,
};
criterion.prepare(ctx, &mut group);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_enhancer: &query_enhancer,
automatons: &automatons,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last sorted document is beyond
// the end of the requested range, so we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena));
Ok(iter.collect())
}
pub fn bucket_sort_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
criteria: Criteria<'c>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let (mut automatons, mut query_enhancer) =
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut prefiltered_documents = 0;
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
prefiltered_documents += 1;
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
raw_documents.push(raw_document);
}
}
debug!("creating {} (original {}) candidates documents took {:.02?}",
raw_documents.len(),
prefiltered_documents,
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables inform about the current distinct map and
// about the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for mut group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let ctx = ContextMut {
postings_lists: &mut arena,
query_enhancer: &mut query_enhancer,
automatons: &mut automatons,
};
let before_criterion_preparation = Instant::now();
criterion.prepare(ctx, &mut group);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_enhancer: &query_enhancer,
automatons: &automatons,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sorted enough documents if the last sorted document is beyond
// the end of the requested range, so we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we have classified the documents related to the current
// automatons, we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
let mut documents = Vec::with_capacity(range.len());
for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&raw_document.id).unwrap(),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&raw_document.id).unwrap();
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
documents.push(Document::from_raw(raw_document, &automatons, &arena));
if documents.len() == range.len() {
break;
}
}
}
}
Ok(documents)
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
impl fmt::Debug for BareMatch<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BareMatch")
.field("document_id", &self.document_id)
.field("query_index", &self.query_index)
.field("distance", &self.distance)
.field("is_exact", &self.is_exact)
.finish()
}
}
// TODO remove that
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: u16,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Clone)]
pub enum PostingsListView<'txn> {
Original {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
},
Rewritten {
input: Rc<[u8]>,
postings_list: SetBuf<DocIndex>,
},
}
impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input()).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}
impl<'txn> PostingsListView<'txn> {
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView::Original { input, postings_list, offset: 0, len }
}
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
PostingsListView::Rewritten { input, postings_list }
}
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
let input = match self {
PostingsListView::Original { input, .. } => input.clone(),
PostingsListView::Rewritten { input, .. } => input.clone(),
};
*self = PostingsListView::rewritten(input, postings_list);
}
pub fn len(&self) -> usize {
match self {
PostingsListView::Original { len, .. } => *len,
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
}
}
pub fn input(&self) -> &[u8] {
match self {
PostingsListView::Original { ref input, .. } => input,
PostingsListView::Rewritten { ref input, .. } => input,
}
}
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
match self {
PostingsListView::Original { input, postings_list, offset, len } => {
assert!(range_offset + range_len <= *len);
PostingsListView::Original {
input: input.clone(),
postings_list: postings_list.clone(),
offset: offset + range_offset,
len: range_len,
}
},
PostingsListView::Rewritten { .. } => {
panic!("Cannot create a range on a rewritten postings list view");
}
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
self
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
match *self {
PostingsListView::Original { ref postings_list, offset, len, .. } => {
Set::new_unchecked(&postings_list[offset..offset + len])
},
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
}
}
}
fn fetch_matches<'txn, 'tag>(
reader: &'txn heed::RoTxn<MainT>,
automatons: &[QueryWordAutomaton],
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
) -> MResult<Vec<BareMatch<'tag>>>
{
let before_words_fst = Instant::now();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
debug!("words fst took {:.02?}", before_words_fst.elapsed());
let mut total_postings_lists = Vec::new();
let mut dfa_time = Duration::default();
let mut stream_next_time = Duration::default();
let mut postings_lists_fetching_time = Duration::default();
for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now();
let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, .. } = automaton;
dfa_time += before_dfa.elapsed();
let mut number_of_words = 0;
let mut stream = words.search(&dfa).into_stream();
// while let Some(input) = stream.next() {
loop {
let before_stream_next = Instant::now();
let input = match stream.next() {
Some(input) => input,
None => break,
};
stream_next_time += before_stream_next.elapsed();
number_of_words += 1;
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::original(input, postings_list);
let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
let document_id = group[0].document_id;
let bare_match = BareMatch {
document_id,
query_index: query_index as u16,
distance,
is_exact,
postings_list: posting_list_index,
};
total_postings_lists.push(bare_match);
offset += group.len();
}
}
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
}
debug!("{:?} gives {} words", query, number_of_words);
}
debug!("stream next took {:.02?}", stream_next_time);
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
debug!("dfa creation took {:.02?}", dfa_time);
Ok(total_postings_lists)
}
#[derive(Debug)]
pub struct QueryWordAutomaton {
pub query: String,
/// Is it a word that must be considered exact
/// or is it some derived word (i.e. a synonym)
pub is_exact: bool,
pub is_prefix: bool,
/// If it's a phrase query, what its index is
/// and the length of the phrase
pub phrase_query: Option<(u16, u16)>,
}
impl QueryWordAutomaton {
pub fn exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: false,
phrase_query: None,
}
}
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: true,
phrase_query: None,
}
}
pub fn non_exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: false,
is_prefix: false,
phrase_query: None,
}
}
pub fn dfa(&self) -> DFA {
if self.phrase_query.is_some() {
build_exact_dfa(&self.query)
} else if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
}
fn split_best_frequency<'a>(
reader: &heed::RoTxn<MainT>,
word: &'a str,
postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len());
let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len());
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn construct_automatons(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
QueryWordAutomaton::exact(word)
} else {
QueryWordAutomaton::exact_prefix(word)
};
automaton_index += 1;
automatons.push(automaton);
}
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed,
// i.e. "new " does not, but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
QueryWordAutomaton::exact(synonym)
} else {
QueryWordAutomaton::non_exact(synonym)
};
automaton_index += 1;
automatons.push(automaton);
}
}
}
}
if n == 1 {
// automatons for split words
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(left_automaton);
let mut right_automaton = QueryWordAutomaton::exact(right);
right_automaton.phrase_query = Some((1, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1;
automatons.push(right_automaton);
}
} else {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = QueryWordAutomaton::exact(&normalized);
automaton_index += 1;
automatons.push(automaton);
}
}
}
Ok((automatons, enhancer_builder.build()))
}

View File

@ -0,0 +1,35 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::RawDocument;
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};
pub struct Attribute;
impl Criterion for Attribute {
fn name(&self) -> &str { "attribute" }
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
let mut sum_of_attribute = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_of_attribute += group[0].attribute as usize;
}
sum_of_attribute
}
let lhs = sum_of_attribute(&lhs.processed_matches);
let rhs = sum_of_attribute(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,16 +1,17 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering;
use compact_arena::SmallArena;
use crate::RawDocument;
use super::{Criterion, Context};
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &str { "stable document id" }
fn name(&self) -> &str {
"DocumentId"
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = &lhs.id;
let rhs = &rhs.id;
lhs.cmp(rhs)
}
}

View File

@ -1,132 +1,35 @@
use std::cmp::Ordering;
use meilisearch_schema::SchemaAttr;
use sdset::Set;
use std::cmp::{Ordering, Reverse};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use crate::bucket_sort::BareMatch;
use super::{Criterion, Context, ContextMut};
#[inline]
fn number_exact_matches(
query_index: &[u32],
attribute: &[u16],
is_exact: &[bool],
fields_counts: &Set<(SchemaAttr, u64)>,
) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() {
let len = group.len();
let mut found_exact = false;
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
if *is_exact {
found_exact = true;
let attr = &attribute[index + pos];
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
}
}
}
}
count += found_exact as usize;
index += len;
}
count
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
let attribute = lhs.attribute();
let fields_counts = &lhs.fields_counts;
fn name(&self) -> &str { "exact" }
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
fn prepare(&self, _ctx: ContextMut, documents: &mut [RawDocument]) {
for document in documents {
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
}
}
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
let attribute = rhs.attribute();
let fields_counts = &rhs.fields_counts;
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_exact_query_words += group[0].is_exact as usize;
}
sum_exact_query_words
}
let lhs = sum_exact_query_words(&lhs.raw_matches);
let rhs = sum_exact_query_words(&rhs.raw_matches);
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "soulier"
//
// doc0: { 0. "soulier" }
// doc1: { 0. "soulier bleu et blanc" }
#[test]
fn basic() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}
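A standalone sketch of the new exactness count (not part of the diff; it assumes, as the prepare step above arranges, that the exact match of each query word comes first in its group):

// Standalone sketch: count one point per query word whose best match is
// exact; a higher count ranks first, hence the reversed comparison above.
fn sum_exact_query_words(matches: &[(u16, bool)]) -> usize {
    // each tuple is (query_index, is_exact), sorted so that an exact
    // match of a query word comes before its non-exact ones
    let mut sum = 0;
    let mut last_query_index = None;
    for &(query_index, is_exact) in matches {
        if last_query_index != Some(query_index) {
            sum += is_exact as usize;
            last_query_index = Some(query_index);
        }
    }
    sum
}

fn main() {
    // "soulier" matched exactly vs. matched only through a typo
    assert!(sum_exact_query_words(&[(0u16, true)]) > sum_exact_query_words(&[(0u16, false)]));
}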

View File

@ -1,59 +1,71 @@
mod document_id;
mod exact;
mod number_of_words;
mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod words_proximity;
use std::cmp::{self, Ordering};
use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use std::cmp::Ordering;
pub use self::{
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
words_proximity::WordsProximity,
};
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exact;
mod document_id;
mod sort_by_attr;
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
pub use self::typo::Typo;
pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::attribute::Attribute;
pub use self::words_position::WordsPosition;
pub use self::exact::Exact;
pub use self::document_id::DocumentId;
pub use self::sort_by_attr::SortByAttr;
pub trait Criterion {
fn name(&self) -> &str;
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
/* ... */
}
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> Ordering;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> bool
{
self.evaluate(ctx, lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
pub struct ContextMut<'p, 'tag, 'txn, 'q, 'a> {
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
pub query_enhancer: &'q mut QueryEnhancer,
pub automatons: &'a mut [QueryWordAutomaton],
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
pub query_enhancer: &'q QueryEnhancer,
pub automatons: &'a [QueryWordAutomaton],
}
#[derive(Default)]
@ -103,11 +115,11 @@ pub struct Criteria<'a> {
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Typo)
.add(Words)
.add(Proximity)
.add(Attribute)
.add(WordsPosition)
.add(Exact)
.add(DocumentId)
.build()
@ -119,3 +131,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
&self.inner
}
}
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }
let mut processed = Vec::new();
for m in document.raw_matches.iter() {
if postings_lists[m.postings_list].is_empty() { continue }
let range = query_enhancer.replacement(m.query_index as u32);
let new_len = cmp::max(range.end as usize, processed.len());
processed.resize(new_len, None);
for index in range {
let index = index as usize;
processed[index] = match processed[index] {
Some(distance) if distance > m.distance => Some(m.distance),
Some(distance) => Some(distance),
None => Some(m.distance),
};
}
}
document.processed_distances = processed;
}
}
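A standalone sketch of the distance bookkeeping done above (not part of the diff): every raw match votes for all query indices in its replacement range, and only the smallest distance per index is kept.

// Standalone sketch: keep, per query index, the smallest distance seen so far.
fn keep_min_distance(processed: &mut Vec<Option<u8>>, index: usize, distance: u8) {
    if processed.len() <= index {
        processed.resize(index + 1, None);
    }
    processed[index] = match processed[index] {
        Some(d) if d <= distance => Some(d),
        _ => Some(distance),
    };
}

fn main() {
    let mut processed = Vec::new();
    keep_min_distance(&mut processed, 1, 2); // first seen with two typos
    keep_min_distance(&mut processed, 1, 0); // later seen as an exact match
    assert_eq!(processed, vec![None, Some(0)]);
}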
fn prepare_raw_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
for document in documents {
if !document.processed_matches.is_empty() { continue }
let mut processed = Vec::new();
for m in document.raw_matches.iter() {
let postings_list = &postings_lists[m.postings_list];
processed.reserve(postings_list.len());
for di in postings_list.as_ref() {
let simple_match = SimpleMatch {
query_index: m.query_index,
distance: m.distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: m.is_exact,
};
processed.push(simple_match);
}
}
let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons);
document.processed_matches = processed.into_vec();
}
}
fn multiword_rewrite_matches(
matches: &mut [SimpleMatch],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch>
{
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
let query_index = rep.next().unwrap() as u16;
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups, stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}
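A standalone sketch of the padding idea driving the loop above (not part of the diff; the signature is deliberately simplified): when a single query word such as "NYC" is rewritten into several words, its one positional match is expanded into consecutive word indices so that the relative positions of the following words stay coherent.

// Standalone sketch: expand one match into one match per replacement
// query index, at consecutive word indices.
#[derive(Debug, PartialEq)]
struct Padded { query_index: u16, word_index: u16 }

fn pad_match(word_index: u16, replacement: std::ops::Range<u16>) -> Vec<Padded> {
    replacement
        .enumerate()
        .map(|(i, query_index)| Padded { query_index, word_index: word_index + i as u16 })
        .collect()
}

fn main() {
    // "NYC" matched at word index 2, replaced by query indices 4..7
    // standing for "new", "york" and "city"
    let padded = pad_match(2, 4..7);
    assert_eq!(padded, vec![
        Padded { query_index: 4, word_index: 2 },
        Padded { query_index: 5, word_index: 3 },
        Padded { query_index: 6, word_index: 4 },
    ]);
}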

View File

@ -1,31 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"NumberOfWords"
}
}

View File

@ -0,0 +1,66 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch};
use crate::RawDocument;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};
const MAX_DISTANCE: u16 = 8;
pub struct Proximity;
impl Criterion for Proximity {
fn name(&self) -> &str { "proximity" }
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
else { index_proximity(lhs.word_index, rhs.word_index) }
}
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
let mut min_prox = u16::max_value();
for a in lhs {
for b in rhs {
let prox = attribute_proximity(*a, *b);
min_prox = cmp::min(min_prox, prox);
}
}
min_prox
}
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
let mut proximity = 0;
let mut iter = matches.linear_group_by_key(|m| m.query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
let lhs = matches_proximity(&lhs.processed_matches);
let rhs = matches_proximity(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}
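A standalone sketch reproducing the index_proximity rule used above (not part of the diff): a forward gap costs its distance, a backward gap costs one more, and everything is capped at MAX_DISTANCE.

// Standalone sketch of the proximity cost between two word indices.
const MAX_DISTANCE: u16 = 8;

fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        (rhs - lhs).min(MAX_DISTANCE)
    } else {
        (lhs - rhs).min(MAX_DISTANCE) + 1
    }
}

fn main() {
    assert_eq!(index_proximity(0, 1), 1);  // "new" directly before "york"
    assert_eq!(index_proximity(1, 0), 2);  // reversed order is penalized
    assert_eq!(index_proximity(0, 42), 8); // far apart: capped at MAX_DISTANCE
}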

View File

@ -1,10 +1,9 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use crate::criterion::Criterion;
use crate::{RankedMap, RawDocument};
use meilisearch_schema::{Schema, SchemaAttr};
use crate::{RankedMap, RawDocument};
use super::{Criterion, Context};
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
@ -28,11 +27,11 @@ use meilisearch_schema::{Schema, SchemaAttr};
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Typo)
/// .add(Words)
/// .add(Proximity)
/// .add(Attribute)
/// .add(WordsPosition)
/// .add(Exact)
/// .add(custom_ranking)
/// .add(DocumentId);
@ -86,8 +85,12 @@ impl<'a> SortByAttr<'a> {
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
impl Criterion for SortByAttr<'_> {
fn name(&self) -> &str {
"sort by attribute"
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
@ -105,10 +108,6 @@ impl<'a> Criterion for SortByAttr<'a> {
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

View File

@ -1,116 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a rough, hardcoded base-10 logarithm.
// It is safe to panic on an input number higher than 3,
// as the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"SumOfTypos"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -0,0 +1,59 @@
use std::cmp::Ordering;
use compact_arena::SmallArena;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists);
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
// This function is a rough, hardcoded base-10 logarithm.
// It is safe to panic on an input number higher than 3,
// as the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn compute_typos(distances: &[Option<u8>]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
for distance in distances {
if let Some(distance) = distance {
sum_typos += custom_log10(*distance);
number_words += 1;
}
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
let lhs = compute_typos(&lhs.processed_distances);
let rhs = compute_typos(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}
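A standalone sketch of the score computed above (not part of the diff), checking the arithmetic: more matched words and fewer typos both push the score up, and documents are compared in reverse.

// Standalone sketch of the typo score.
fn custom_log10(n: u8) -> f32 {
    match n {
        0 => 0.0,
        1 => 0.30102,
        2 => 0.47712,
        3 => 0.60205,
        _ => panic!("invalid number"),
    }
}

fn compute_typos(distances: &[Option<u8>]) -> usize {
    let mut number_words = 0usize;
    let mut sum_typos = 0.0f32;
    for distance in distances.iter().flatten() {
        sum_typos += custom_log10(*distance);
        number_words += 1;
    }
    (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}

fn main() {
    // two exact matches: 2 / 1.0 * 1000 = 2000
    // one exact match plus one single-typo match: 2 / 1.30102 * 1000 ≈ 1537
    assert!(compute_typos(&[Some(0u8), Some(0)]) > compute_typos(&[Some(0u8), Some(1)]));
}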

View File

@ -0,0 +1,29 @@
use std::cmp::Ordering;
use crate::RawDocument;
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists);
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
distances.iter().cloned().filter(Option::is_some).count()
}
let lhs = number_of_query_words(&lhs.processed_distances);
let rhs = number_of_query_words(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}
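A standalone sketch of the comparison above (not part of the diff): documents matching more distinct query words come first, hence the reversed ordering.

// Standalone sketch: count the query words that got at least one match.
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
    distances.iter().filter(|d| d.is_some()).count()
}

fn main() {
    let lhs: [Option<u8>; 3] = [Some(0), Some(1), None]; // two query words matched
    let rhs: [Option<u8>; 3] = [Some(0), None, None];    // one query word matched
    assert!(number_of_query_words(&lhs) > number_of_query_words(&rhs));
}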

View File

@ -0,0 +1,43 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::RawDocument;
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};
pub struct WordsPosition;
impl Criterion for WordsPosition {
fn name(&self) -> &str { "words position" }
fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
documents: &mut [RawDocument<'r, 'tag>],
) {
prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
}
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> Ordering
{
#[inline]
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
let mut sum_words_position = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_words_position += group[0].word_index as usize;
}
sum_words_position
}
let lhs = sum_words_position(&lhs.processed_matches);
let rhs = sum_words_position(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,164 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr {
return MAX_DISTANCE;
}
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16 {
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len]
.linear_group()
.next()
.unwrap()
.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"WordsProximity"
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
17
);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
3
);
}
}

View File

@ -18,6 +18,9 @@ pub mod serde;
pub mod store;
mod update;
// TODO replace
mod bucket_sort;
pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, MResult};
pub use self::number::{Number, ParseNumberError};
@ -25,63 +28,48 @@ pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
pub query_index: u32,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
use compact_arena::SmallArena;
use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
use crate::levenshtein::prefix_damerau_levenshtein;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<TmpMatch>,
// #[cfg(test)]
// pub matches: Vec<TmpMatch>,
}
impl Document {
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document {
id: raw.id,
highlights: raw.highlights,
}
}
pub fn from_raw<'a, 'tag, 'txn>(
raw_document: RawDocument<'a, 'tag>,
automatons: &[QueryWordAutomaton],
arena: &SmallArena<'tag, PostingsListView<'txn>>,
) -> Document
{
let highlights = raw_document.raw_matches.iter().flat_map(|sm| {
let postings_list = &arena[sm.postings_list];
let input = postings_list.input();
let query = &automatons[sm.query_index as usize].query;
postings_list.iter().map(move |m| {
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
#[cfg(test)]
fn from_raw(raw: RawDocument) -> Document {
let len = raw.query_index().len();
let mut matches = Vec::with_capacity(len);
Highlight {
attribute: m.attribute,
char_index: m.char_index,
char_length: covered_area as u16,
}
})
}).collect();
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
for i in 0..len {
let match_ = TmpMatch {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
};
matches.push(match_);
}
Document {
id: raw.id,
matches,
highlights: raw.highlights,
}
Document { id: raw_document.id, highlights }
}
}

View File

@ -1,20 +1,9 @@
use hashbrown::HashMap;
use std::convert::TryFrom;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use std::{cmp, mem};
use fst::{IntoStreamer, Streamer};
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use std::time::Duration;
use crate::database::MainT;
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct};
use crate::{criterion::Criteria, Document, DocumentId};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> {
@ -29,249 +18,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
synonyms_store: store::Synonyms,
}
fn multiword_rewrite_matches(
mut matches: Vec<(DocumentId, TmpMatch)>,
query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)> {
let mut padded_matches = Vec::with_capacity(matches.len());
// we sort the matches by word index to make them rewritable
matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for (id, match_) in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = TmpMatch {
query_index,
word_index,
..*match_
};
for (_, nmatch_) in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index);
let query_index = rep.next().unwrap();
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index =
match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
biggest = biggest.max(i + 1);
}
}
padded_matches.push((*id, padmatch));
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups, stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
document_matches.sort_unstable();
}
SetBuf::new_unchecked(padded_matches)
}
fn fetch_raw_documents(
reader: &heed::RoTxn<MainT>,
automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
) -> MResult<Vec<RawDocument>> {
let mut matches = Vec::new();
let mut highlights = Vec::new();
for group in automatons_groups {
let AutomatonGroup {
is_phrase_query,
automatons,
} = group;
let phrase_query_len = automatons.len();
let mut tmp_matches = Vec::new();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton {
index,
is_exact,
query_len,
query,
..
} = automaton;
let dfa = automaton.dfa();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
let covered_area = if *query_len > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
tmp_matches.reserve(doc_indexes.len());
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
let covered_area = cmp::min(covered_area, di.char_length);
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: covered_area,
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
}
}
if *is_phrase_query {
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// if the matches must follow each other and actually do
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase queries longer than 2
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
}
} else {
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
}
}
let matches = multiword_rewrite_matches(matches, &query_enhancer);
let highlights = {
highlights.sort_unstable_by_key(|(id, _)| *id);
SetBuf::new_unchecked(highlights)
};
let fields_counts = {
let mut fields_counts = Vec::new();
for group in matches.linear_group_by_key(|(id, ..)| *id) {
let id = group[0].0;
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
let (attr, count) = result?;
fields_counts.push((id, attr, count));
}
}
SetBuf::new(fields_counts).unwrap()
};
Ok(raw_documents_from(matches, highlights, fields_counts))
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn new(
main: store::Main,
@ -307,9 +53,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
synonyms_store: synonyms,
}
}
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn with_filter<F>(&mut self, function: F)
where
F: Fn(DocumentId) -> bool + 'f,
@ -342,29 +86,25 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
range: Range<usize>,
) -> MResult<Vec<Document>> {
match self.distinct {
Some((distinct, distinct_size)) => raw_query_with_distinct(
Some((distinct, distinct_size)) => bucket_sort_with_distinct(
reader,
query,
range,
self.filter,
distinct,
distinct_size,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
self.synonyms_store,
),
None => raw_query(
None => bucket_sort(
reader,
query,
range,
self.filter,
self.timeout,
self.criteria,
self.searchable_attrs,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,
@ -374,320 +114,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
}
}
fn raw_query<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
timeout: Option<Duration>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return raw_query_with_distinct(
reader,
query,
range,
filter,
distinct,
distinct_size,
timeout,
criteria,
searchable_attrs,
main_store,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
);
}
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one after the other
for auts in automaton_producer {
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
let mut raw_documents = fetch_raw_documents(
reader,
&automatons,
&query_enhancer,
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
documents_fields_counts_store,
)?;
// stop processing when time is running out
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break;
}
}
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
documents_seen += group.len();
groups.push(group);
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last sorted document is after
// the end of the requested range; we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
// once we have classified the documents related to the current
// automatons, we save that as the next valid result
let iter = raw_documents
.into_iter()
.skip(range.start)
.take(range.len());
raw_documents_processed.clear();
raw_documents_processed.extend(iter);
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout {
break;
}
}
}
// make real documents now that we know
// those must be returned
let documents = raw_documents_processed
.into_iter()
.map(Document::from_raw)
.collect();
Ok(documents)
}
fn raw_query_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
timeout: Option<Duration>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let start_processing = Instant::now();
let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// aggregate the automaton groups one after the other
for auts in automaton_producer {
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
let mut raw_documents = fetch_raw_documents(
reader,
&automatons,
&query_enhancer,
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
documents_fields_counts_store,
)?;
// stop processing when time is running out
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break;
}
}
let mut groups = vec![raw_documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables inform on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinct length of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sorted enough documents if the last sorted document is after
// the end of the requested range; we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we have classified the documents related to the current
// automatons, we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
raw_documents_processed.clear();
for document in raw_documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&document.id).unwrap(),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&document.id).unwrap();
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
raw_documents_processed.push(document);
if raw_documents_processed.len() == range.len() {
break;
}
}
}
}
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout {
break;
}
}
}
// make real documents now that we know
// those must be returned
let documents = raw_documents_processed
.into_iter()
.map(Document::from_raw)
.collect();
Ok(documents)
}
#[cfg(test)]
mod tests {
use super::*;
@ -815,7 +241,7 @@ mod tests {
let mut words_fst = BTreeSet::new();
let mut postings_lists = HashMap::new();
let mut fields_counts = HashMap::<_, u64>::new();
let mut fields_counts = HashMap::<_, u16>::new();
for (word, indexes) in iter {
let word = word.to_lowercase().into_bytes();

View File

@ -1,398 +0,0 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Returns `true` if the specified range can accept the given replacement words.
/// Returns `false` if the replacement words are already present in the original query
/// or if there are fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there are fewer or the same number of replacement words
// as there already are in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the range of original words that these `replacement` words replace,
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the relation between real and origin query indices;
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -1,186 +1,89 @@
use std::fmt;
use std::sync::Arc;
use meilisearch_schema::SchemaAttr;
use compact_arena::SmallArena;
use itertools::EitherOrBoth;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{DocumentId, Highlight, TmpMatch};
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
pub struct RawDocument<'a, 'tag> {
pub id: crate::DocumentId,
pub raw_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
}
impl RawDocument {
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.query_index
.get_unchecked(r.start..r.end)
}
}
impl<'a, 'tag> RawDocument<'a, 'tag> {
pub fn new<'txn>(
raw_matches: &'a mut [BareMatch<'tag>],
automatons: &[QueryWordAutomaton],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
) -> Option<RawDocument<'a, 'tag>>
{
raw_matches.sort_unstable_by_key(|m| m.query_index);
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
let mut previous_word = None;
for i in 0..raw_matches.len() {
let a = &raw_matches[i];
let auta = &automatons[a.query_index as usize];
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
match auta.phrase_query {
Some((0, _)) => {
let b = match raw_matches.get(i + 1) {
Some(b) => b,
None => {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue;
}
};
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.word_index
.get_unchecked(r.start..r.end)
}
}
if a.query_index + 1 != b.query_index {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
let pla = &postings_lists[a.postings_list];
let plb = &postings_lists[b.postings_list];
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"query_index",
self.query_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"distance",
self.distance()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"attribute",
self.attribute()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"word_index",
self.word_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"is_exact",
self.is_exact()
))?;
f.write_str("}")?;
Ok(())
}
}
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
});
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let mut newa = Vec::new();
let mut newb = Vec::new();
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
for eb in iter {
if let EitherOrBoth::Both(a, b) = eb {
newa.push(*a);
newb.push(*b);
}
}
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
if !newa.is_empty() {
previous_word = Some(a.query_index);
}
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches {
range,
matches: matches.clone(),
};
RawDocument {
id,
matches,
highlights,
fields_counts,
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
},
Some((1, _)) => {
if previous_word.take() != Some(a.query_index - 1) {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
}
},
Some((_, _)) => unreachable!(),
None => (),
}
}
if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
return None
}
Some(RawDocument {
id: raw_matches[0].document_id,
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
})
.collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
}
}
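
The heart of the new RawDocument::new above is the phrase-query filter: for a two-word phrase, the postings lists of both words are walked in lockstep with itertools::merge_join_by, and only pairs sitting in the same attribute at consecutive word indexes survive; everything else is rewritten to an empty set. The snippet below is a self-contained sketch of that adjacency filter under simplified types: DocIndex here is a stand-in carrying only the two fields the comparison reads, and keep_adjacent is a hypothetical free function, not part of the crate.

    use itertools::{merge_join_by, EitherOrBoth};

    /// Simplified stand-in for a postings-list entry: only the fields
    /// the phrase filter compares.
    #[derive(Debug, Clone, Copy, PartialEq)]
    struct DocIndex {
        attribute: u16,
        word_index: u16,
    }

    /// Keep only the positions of `first` that are immediately followed,
    /// in the same attribute, by a position of `second` (and vice versa).
    /// Both slices must be sorted by (attribute, word_index).
    fn keep_adjacent(first: &[DocIndex], second: &[DocIndex]) -> (Vec<DocIndex>, Vec<DocIndex>) {
        let mut newa = Vec::new();
        let mut newb = Vec::new();
        let iter = merge_join_by(first.iter(), second.iter(), |a, b| {
            a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
        });
        for eb in iter {
            if let EitherOrBoth::Both(a, b) = eb {
                newa.push(*a);
                newb.push(*b);
            }
        }
        (newa, newb)
    }

    fn main() {
        // "new" appears at words 3 and 9, "york" at word 4:
        // only the 3/4 pair is adjacent, so the lone match at word 9 is dropped.
        let first = [DocIndex { attribute: 0, word_index: 3 }, DocIndex { attribute: 0, word_index: 9 }];
        let second = [DocIndex { attribute: 0, word_index: 4 }];
        let (kept_a, kept_b) = keep_adjacent(&first, &second);
        assert_eq!(kept_a, vec![DocIndex { attribute: 0, word_index: 3 }]);
        assert_eq!(kept_b, vec![DocIndex { attribute: 0, word_index: 4 }]);
    }

The same rewrite also explains the previous_word bookkeeping in the diff: the second word of a phrase only keeps its matches when the first word produced a non-empty rewritten list for the preceding query index.
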

View File

@ -325,7 +325,7 @@ where
txn,
document_id,
attribute,
number_of_words as u64,
number_of_words as u16,
)?;
}
}
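
This hunk narrows the stored per-attribute word count from u64 to u16. The `as` cast in the new line silently keeps only the low 16 bits, so an attribute with more than 65_535 indexed words would wrap around. A checked conversion that saturates instead is shown below purely as an illustration, not as what the patch does; `count_to_u16` is a hypothetical helper.

    use std::convert::TryFrom;

    /// Convert a word count to the new `u16` storage type,
    /// saturating instead of truncating when it does not fit.
    fn count_to_u16(number_of_words: usize) -> u16 {
        u16::try_from(number_of_words).unwrap_or(u16::MAX)
    }

    fn main() {
        assert_eq!(count_to_u16(42), 42);
        assert_eq!(count_to_u16(100_000), u16::MAX);
    }
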

View File

@ -7,7 +7,7 @@ use meilisearch_schema::SchemaAttr;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl DocumentsFieldsCounts {
@ -16,7 +16,7 @@ impl DocumentsFieldsCounts {
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
value: u16,
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value)
@ -42,7 +42,7 @@ impl DocumentsFieldsCounts {
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>> {
) -> ZResult<Option<u16>> {
let key = DocumentAttrKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)),
@ -79,11 +79,11 @@ impl DocumentsFieldsCounts {
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
type Item = ZResult<(SchemaAttr, u64)>;
type Item = ZResult<(SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
@ -99,7 +99,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for DocumentsIdsIter<'_> {
@ -123,11 +123,11 @@ impl Iterator for DocumentsIdsIter<'_> {
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
type Item = ZResult<(DocumentId, SchemaAttr, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {

View File

@ -303,11 +303,11 @@ impl<'a> SearchBuilder<'a> {
if let Some(ranking_rules_order) = ranking_order {
for rule in ranking_rules_order {
match rule.as_str() {
"_sum_of_typos" => builder.push(SumOfTypos),
"_number_of_words" => builder.push(NumberOfWords),
"_word_proximity" => builder.push(WordsProximity),
"_sum_of_words_attribute" => builder.push(SumOfWordsAttribute),
"_sum_of_words_position" => builder.push(SumOfWordsPosition),
"_typo" => builder.push(Typo),
"_words" => builder.push(Words),
"_proximity" => builder.push(Proximity),
"_attribute" => builder.push(Attribute),
"_words_position" => builder.push(WordsPosition),
"_exact" => builder.push(Exact),
_ => {
let order = match ranking_rules.get(rule.as_str()) {
@ -333,11 +333,11 @@ impl<'a> SearchBuilder<'a> {
builder.push(DocumentId);
return Ok(Some(builder.build()));
} else {
builder.push(SumOfTypos);
builder.push(NumberOfWords);
builder.push(WordsProximity);
builder.push(SumOfWordsAttribute);
builder.push(SumOfWordsPosition);
builder.push(Typo);
builder.push(Words);
builder.push(Proximity);
builder.push(Attribute);
builder.push(WordsPosition);
builder.push(Exact);
for (rule, order) in ranking_rules.iter() {
let custom_ranking = match order {
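
The renames above replace the old criteria (SumOfTypos, NumberOfWords, WordsProximity, SumOfWordsAttribute, SumOfWordsPosition) with the new set Typo, Words, Proximity, Attribute, WordsPosition and Exact, both when a custom ranking order is supplied and in the default chain. The toy builder below only illustrates the ordering idea; the enum, CriteriaBuilder and the trailing DocumentId tiebreaker are stand-ins invented for this sketch, not the crate's real criterion types, which are unit structs pushed onto a builder as in the diff.

    /// Illustrative only: the default chain with the new rule names.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum Criterion {
        Typo,
        Words,
        Proximity,
        Attribute,
        WordsPosition,
        Exact,
        DocumentId,
    }

    struct CriteriaBuilder {
        inner: Vec<Criterion>,
    }

    impl CriteriaBuilder {
        fn new() -> CriteriaBuilder {
            CriteriaBuilder { inner: Vec::new() }
        }

        fn push(&mut self, criterion: Criterion) {
            self.inner.push(criterion);
        }

        fn build(self) -> Vec<Criterion> {
            self.inner
        }
    }

    fn main() {
        use Criterion::*;

        let mut builder = CriteriaBuilder::new();
        builder.push(Typo);
        builder.push(Words);
        builder.push(Proximity);
        builder.push(Attribute);
        builder.push(WordsPosition);
        builder.push(Exact);
        builder.push(DocumentId);

        assert_eq!(
            builder.build(),
            vec![Typo, Words, Proximity, Attribute, WordsPosition, Exact, DocumentId]
        );
    }
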

View File

@ -63,3 +63,11 @@ pub struct Highlight {
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(C)]
pub struct AttrCount {
pub attr: u16,
pub count: u16,
}
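
The new AttrCount type packs an attribute identifier and its word count into two u16 fields. With #[repr(C)] the struct is exactly four bytes with no padding, which is what lets the optional zerocopy derives (AsBytes, FromBytes) expose it directly as raw bytes for storage. The round trip below is a hand-written illustration of that layout, assuming little-endian packing; the crate itself would rely on the derived impls rather than these helper functions.

    #[derive(Debug, Copy, Clone, PartialEq, Eq)]
    #[repr(C)]
    pub struct AttrCount {
        pub attr: u16,
        pub count: u16,
    }

    fn to_bytes(v: AttrCount) -> [u8; 4] {
        let [a0, a1] = v.attr.to_le_bytes();
        let [c0, c1] = v.count.to_le_bytes();
        [a0, a1, c0, c1]
    }

    fn from_bytes(bytes: [u8; 4]) -> AttrCount {
        AttrCount {
            attr: u16::from_le_bytes([bytes[0], bytes[1]]),
            count: u16::from_le_bytes([bytes[2], bytes[3]]),
        }
    }

    fn main() {
        let original = AttrCount { attr: 3, count: 127 };
        assert_eq!(std::mem::size_of::<AttrCount>(), 4); // two u16s, no padding
        assert_eq!(from_bytes(to_bytes(original)), original);
    }
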