mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge pull request #38 from meilisearch/facet-queries
Introduce a facet filter system
This commit is contained in:
		
							
								
								
									
										143
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										143
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -45,6 +45,27 @@ version = "1.2.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" | ||||
|  | ||||
| [[package]] | ||||
| name = "block-buffer" | ||||
| version = "0.7.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" | ||||
| dependencies = [ | ||||
|  "block-padding", | ||||
|  "byte-tools", | ||||
|  "byteorder", | ||||
|  "generic-array", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "block-padding" | ||||
| version = "0.1.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" | ||||
| dependencies = [ | ||||
|  "byte-tools", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "bstr" | ||||
| version = "0.2.13" | ||||
| @@ -63,6 +84,12 @@ version = "3.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" | ||||
|  | ||||
| [[package]] | ||||
| name = "byte-tools" | ||||
| version = "0.3.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" | ||||
|  | ||||
| [[package]] | ||||
| name = "byteorder" | ||||
| version = "1.3.4" | ||||
| @@ -285,12 +312,27 @@ dependencies = [ | ||||
|  "memchr", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "digest" | ||||
| version = "0.8.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" | ||||
| dependencies = [ | ||||
|  "generic-array", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "either" | ||||
| version = "1.6.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" | ||||
|  | ||||
| [[package]] | ||||
| name = "fake-simd" | ||||
| version = "0.1.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" | ||||
|  | ||||
| [[package]] | ||||
| name = "flate2" | ||||
| version = "1.0.17" | ||||
| @@ -324,6 +366,15 @@ dependencies = [ | ||||
|  "byteorder", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "generic-array" | ||||
| version = "0.12.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" | ||||
| dependencies = [ | ||||
|  "typenum", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "getrandom" | ||||
| version = "0.1.14" | ||||
| @@ -378,9 +429,9 @@ dependencies = [ | ||||
|  | ||||
| [[package]] | ||||
| name = "heed" | ||||
| version = "0.10.1" | ||||
| version = "0.10.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" | ||||
| checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" | ||||
| dependencies = [ | ||||
|  "byteorder", | ||||
|  "heed-traits", | ||||
| @@ -617,9 +668,12 @@ dependencies = [ | ||||
|  "maplit", | ||||
|  "memmap", | ||||
|  "near-proximity", | ||||
|  "num-traits", | ||||
|  "obkv", | ||||
|  "once_cell", | ||||
|  "ordered-float", | ||||
|  "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", | ||||
|  "pest_derive", | ||||
|  "rayon", | ||||
|  "ringtail", | ||||
|  "roaring", | ||||
| @@ -675,9 +729,9 @@ dependencies = [ | ||||
|  | ||||
| [[package]] | ||||
| name = "num-traits" | ||||
| version = "0.2.12" | ||||
| version = "0.2.14" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" | ||||
| checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" | ||||
| dependencies = [ | ||||
|  "autocfg", | ||||
| ] | ||||
| @@ -716,6 +770,12 @@ version = "11.1.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" | ||||
|  | ||||
| [[package]] | ||||
| name = "opaque-debug" | ||||
| version = "0.2.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" | ||||
|  | ||||
| [[package]] | ||||
| name = "ordered-float" | ||||
| version = "2.0.0" | ||||
| @@ -741,6 +801,57 @@ version = "2.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | ||||
|  | ||||
| [[package]] | ||||
| name = "pest" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" | ||||
| dependencies = [ | ||||
|  "ucd-trie", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest" | ||||
| version = "2.1.3" | ||||
| source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" | ||||
| dependencies = [ | ||||
|  "ucd-trie", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_derive" | ||||
| version = "2.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" | ||||
| dependencies = [ | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "pest_generator", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_generator" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" | ||||
| dependencies = [ | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "pest_meta", | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_meta" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" | ||||
| dependencies = [ | ||||
|  "maplit", | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "sha-1", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pkg-config" | ||||
| version = "0.3.19" | ||||
| @@ -1025,6 +1136,18 @@ dependencies = [ | ||||
|  "serde", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "sha-1" | ||||
| version = "0.8.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" | ||||
| dependencies = [ | ||||
|  "block-buffer", | ||||
|  "digest", | ||||
|  "fake-simd", | ||||
|  "opaque-debug", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "slice-group-by" | ||||
| version = "0.2.6" | ||||
| @@ -1233,6 +1356,18 @@ version = "0.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" | ||||
|  | ||||
| [[package]] | ||||
| name = "typenum" | ||||
| version = "1.12.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" | ||||
|  | ||||
| [[package]] | ||||
| name = "ucd-trie" | ||||
| version = "0.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" | ||||
|  | ||||
| [[package]] | ||||
| name = "unicode-bidi" | ||||
| version = "0.3.4" | ||||
|   | ||||
| @@ -14,13 +14,14 @@ flate2 = "1.0.17" | ||||
| fst = "0.4.4" | ||||
| fxhash = "0.2.1" | ||||
| grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | ||||
| heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||
| heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||
| human_format = "1.0.3" | ||||
| jemallocator = "0.3.2" | ||||
| levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | ||||
| linked-hash-map = "0.5.3" | ||||
| memmap = "0.7.0" | ||||
| near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } | ||||
| num-traits = "0.2.14" | ||||
| obkv = "0.1.0" | ||||
| once_cell = "1.4.0" | ||||
| ordered-float = "2.0.0" | ||||
| @@ -36,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he | ||||
| tempfile = "3.1.0" | ||||
| uuid = { version = "0.8.1", features = ["v4"] } | ||||
|  | ||||
| # facet filter parser | ||||
| pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } | ||||
| pest_derive = "2.1.0" | ||||
|  | ||||
| # documents words self-join | ||||
| itertools = "0.9.0" | ||||
|  | ||||
|   | ||||
							
								
								
									
										70
									
								
								http-ui/Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										70
									
								
								http-ui/Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -654,9 +654,9 @@ dependencies = [ | ||||
|  | ||||
| [[package]] | ||||
| name = "heed" | ||||
| version = "0.10.1" | ||||
| version = "0.10.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" | ||||
| checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" | ||||
| dependencies = [ | ||||
|  "byteorder", | ||||
|  "heed-traits", | ||||
| @@ -934,6 +934,12 @@ dependencies = [ | ||||
|  "cfg-if 0.1.10", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "maplit" | ||||
| version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" | ||||
|  | ||||
| [[package]] | ||||
| name = "matches" | ||||
| version = "0.1.8" | ||||
| @@ -987,9 +993,12 @@ dependencies = [ | ||||
|  "log", | ||||
|  "memmap", | ||||
|  "near-proximity", | ||||
|  "num-traits", | ||||
|  "obkv", | ||||
|  "once_cell", | ||||
|  "ordered-float", | ||||
|  "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", | ||||
|  "pest_derive", | ||||
|  "rayon", | ||||
|  "ringtail", | ||||
|  "roaring", | ||||
| @@ -1231,6 +1240,57 @@ version = "2.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | ||||
|  | ||||
| [[package]] | ||||
| name = "pest" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" | ||||
| dependencies = [ | ||||
|  "ucd-trie", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest" | ||||
| version = "2.1.3" | ||||
| source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" | ||||
| dependencies = [ | ||||
|  "ucd-trie", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_derive" | ||||
| version = "2.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" | ||||
| dependencies = [ | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "pest_generator", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_generator" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" | ||||
| dependencies = [ | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "pest_meta", | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pest_meta" | ||||
| version = "2.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" | ||||
| dependencies = [ | ||||
|  "maplit", | ||||
|  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "sha-1 0.8.2", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pin-project" | ||||
| version = "0.4.27" | ||||
| @@ -2024,6 +2084,12 @@ version = "1.12.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" | ||||
|  | ||||
| [[package]] | ||||
| name = "ucd-trie" | ||||
| version = "0.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" | ||||
|  | ||||
| [[package]] | ||||
| name = "unicase" | ||||
| version = "2.6.0" | ||||
|   | ||||
| @@ -8,7 +8,7 @@ edition = "2018" | ||||
| [dependencies] | ||||
| anyhow = "1.0.28" | ||||
| grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | ||||
| heed = "0.10.1" | ||||
| heed = "0.10.4" | ||||
| memmap = "0.7.0" | ||||
| milli = { path = ".." } | ||||
| once_cell = "1.4.1" | ||||
|   | ||||
| @@ -1,8 +1,9 @@ | ||||
| var request = null; | ||||
| var timeoutID = null; | ||||
|  | ||||
| $('#search').on('input', function () { | ||||
|   var query = $(this).val(); | ||||
| $('#query, #facet').on('input', function () { | ||||
|   var query = $('#query').val(); | ||||
|   var facet = $('#facet').val(); | ||||
|   var timeoutMs = 100; | ||||
|  | ||||
|   if (timeoutID !== null) { | ||||
| @@ -14,7 +15,7 @@ $('#search').on('input', function () { | ||||
|       type: "POST", | ||||
|       url: "query", | ||||
|       contentType: 'application/json', | ||||
|       data: JSON.stringify({ 'query': query }), | ||||
|       data: JSON.stringify({ 'query': query, 'facetCondition': facet }), | ||||
|       contentType: 'application/json', | ||||
|       success: function (data, textStatus, request) { | ||||
|         results.innerHTML = ''; | ||||
| @@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) { | ||||
| // We trigger the input when we load the script, this way | ||||
| // we execute a placeholder search when the input is empty. | ||||
| $(window).on('load', function () { | ||||
|   $('#search').trigger('input'); | ||||
|   $('#query').trigger('input'); | ||||
| }); | ||||
|   | ||||
| @@ -2,6 +2,7 @@ use std::borrow::Cow; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::fs::{File, create_dir_all}; | ||||
| use std::net::SocketAddr; | ||||
| use std::num::NonZeroUsize; | ||||
| use std::path::PathBuf; | ||||
| use std::str::FromStr; | ||||
| use std::sync::Arc; | ||||
| @@ -28,7 +29,7 @@ use warp::{Filter, http::Response}; | ||||
| use milli::tokenizer::{simple_tokenizer, TokenType}; | ||||
| use milli::update::UpdateIndexingStep::*; | ||||
| use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; | ||||
| use milli::{obkv_to_json, Index, UpdateStore, SearchResult}; | ||||
| use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; | ||||
|  | ||||
| static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); | ||||
|  | ||||
| @@ -196,6 +197,7 @@ enum UpdateMeta { | ||||
|     DocumentsAddition { method: String, format: String }, | ||||
|     ClearDocuments, | ||||
|     Settings(Settings), | ||||
|     Facets(Facets), | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | ||||
| @@ -231,6 +233,14 @@ struct Settings { | ||||
|     faceted_attributes: Option<HashMap<String, String>>, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | ||||
| #[serde(deny_unknown_fields)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| struct Facets { | ||||
|     level_group_size: Option<NonZeroUsize>, | ||||
|     min_level_size: Option<NonZeroUsize>, | ||||
| } | ||||
|  | ||||
| // Any value that is present is considered Some value, including null. | ||||
| fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error> | ||||
| where T: Deserialize<'de>, | ||||
| @@ -399,6 +409,21 @@ async fn main() -> anyhow::Result<()> { | ||||
|                         Ok(_count) => wtxn.commit().map_err(Into::into), | ||||
|                         Err(e) => Err(e.into()) | ||||
|                     } | ||||
|                 }, | ||||
|                 UpdateMeta::Facets(levels) => { | ||||
|                     // We must use the write transaction of the update here. | ||||
|                     let mut wtxn = index_cloned.write_txn()?; | ||||
|                     let mut builder = update_builder.facets(&mut wtxn, &index_cloned); | ||||
|                     if let Some(value) = levels.level_group_size { | ||||
|                         builder.level_group_size(value); | ||||
|                     } | ||||
|                     if let Some(value) = levels.min_level_size { | ||||
|                         builder.min_level_size(value); | ||||
|                     } | ||||
|                     match builder.execute() { | ||||
|                         Ok(()) => wtxn.commit().map_err(Into::into), | ||||
|                         Err(e) => Err(e.into()) | ||||
|                     } | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
| @@ -550,9 +575,12 @@ async fn main() -> anyhow::Result<()> { | ||||
|             .body(include_str!("../public/logo-black.svg")) | ||||
|         ); | ||||
|  | ||||
|     #[derive(Deserialize)] | ||||
|     #[derive(Debug, Deserialize)] | ||||
|     #[serde(deny_unknown_fields)] | ||||
|     #[serde(rename_all = "camelCase")] | ||||
|     struct QueryBody { | ||||
|         query: Option<String>, | ||||
|         facet_condition: Option<String>, | ||||
|     } | ||||
|  | ||||
|     let disable_highlighting = opt.disable_highlighting; | ||||
| @@ -569,6 +597,12 @@ async fn main() -> anyhow::Result<()> { | ||||
|             if let Some(query) = query.query { | ||||
|                 search.query(query); | ||||
|             } | ||||
|             if let Some(condition) = query.facet_condition { | ||||
|                 if !condition.trim().is_empty() { | ||||
|                     let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); | ||||
|                     search.facet_condition(condition); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             let SearchResult { found_words, documents_ids } = search.execute().unwrap(); | ||||
|  | ||||
| @@ -751,6 +785,19 @@ async fn main() -> anyhow::Result<()> { | ||||
|             Ok(warp::reply()) | ||||
|         }); | ||||
|  | ||||
|     let update_store_cloned = update_store.clone(); | ||||
|     let update_status_sender_cloned = update_status_sender.clone(); | ||||
|     let change_facet_levels_route = warp::filters::method::post() | ||||
|         .and(warp::path!("facet-level-sizes")) | ||||
|         .and(warp::body::json()) | ||||
|         .map(move |levels: Facets| { | ||||
|             let meta = UpdateMeta::Facets(levels); | ||||
|             let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); | ||||
|             let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); | ||||
|             eprintln!("update {} registered", update_id); | ||||
|             warp::reply() | ||||
|         }); | ||||
|  | ||||
|     let update_ws_route = warp::ws() | ||||
|         .and(warp::path!("updates" / "ws")) | ||||
|         .map(move |ws: warp::ws::Ws| { | ||||
| @@ -799,6 +846,7 @@ async fn main() -> anyhow::Result<()> { | ||||
|         .or(indexing_json_stream_route) | ||||
|         .or(clearing_route) | ||||
|         .or(change_settings_route) | ||||
|         .or(change_facet_levels_route) | ||||
|         .or(update_ws_route); | ||||
|  | ||||
|     let addr = SocketAddr::from_str(&opt.http_listen_addr)?; | ||||
|   | ||||
| @@ -55,7 +55,8 @@ | ||||
|         <div class="level-left"> | ||||
|           <div class="level-item"> | ||||
|             <div class="field has-addons has-addons-right"> | ||||
|               <input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney"> | ||||
|               <input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney"> | ||||
|               <input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800"> | ||||
|             </div> | ||||
|           </div> | ||||
|           <div class="level-item"></div> | ||||
|   | ||||
							
								
								
									
										86
									
								
								src/heed_codec/facet/facet_level_value_f64_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										86
									
								
								src/heed_codec/facet/facet_level_value_f64_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,86 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
|  | ||||
| use crate::facet::value_encoding::f64_into_bytes; | ||||
|  | ||||
| // TODO do not de/serialize right bound when level = 0 | ||||
| pub struct FacetLevelValueF64Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { | ||||
|     type DItem = (u8, u8, f64, f64); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, bytes) = bytes.split_first()?; | ||||
|         let (level, bytes) = bytes.split_first()?; | ||||
|  | ||||
|         let (left, right) = if *level != 0 { | ||||
|             let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; | ||||
|             let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; | ||||
|             (left, right) | ||||
|         } else { | ||||
|             let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; | ||||
|             (left, left) | ||||
|         }; | ||||
|  | ||||
|         Some((*field_id, *level, left, right)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { | ||||
|     type EItem = (u8, u8, f64, f64); | ||||
|  | ||||
|     fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut buffer = [0u8; 32]; | ||||
|  | ||||
|         let len = if *level != 0 { | ||||
|             // Write the globally ordered floats. | ||||
|             let bytes = f64_into_bytes(*left)?; | ||||
|             buffer[..8].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             let bytes = f64_into_bytes(*right)?; | ||||
|             buffer[8..16].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             // Then the f64 values just to be able to read them back. | ||||
|             let bytes = left.to_be_bytes(); | ||||
|             buffer[16..24].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             let bytes = right.to_be_bytes(); | ||||
|             buffer[24..].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             32 // length | ||||
|         } else { | ||||
|             // Write the globally ordered floats. | ||||
|             let bytes = f64_into_bytes(*left)?; | ||||
|             buffer[..8].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             // Then the f64 values just to be able to read them back. | ||||
|             let bytes = left.to_be_bytes(); | ||||
|             buffer[8..16].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|             16 // length | ||||
|         }; | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(len + 2); | ||||
|         bytes.push(*field_id); | ||||
|         bytes.push(*level); | ||||
|         bytes.extend_from_slice(&buffer[..len]); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use heed::{BytesEncode, BytesDecode}; | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn globally_ordered_f64() { | ||||
|         let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); | ||||
|         let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); | ||||
|         assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); | ||||
|  | ||||
|         let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); | ||||
|         let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); | ||||
|         assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										43
									
								
								src/heed_codec/facet/facet_level_value_i64_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								src/heed_codec/facet/facet_level_value_i64_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
|  | ||||
| use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; | ||||
|  | ||||
| pub struct FacetLevelValueI64Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { | ||||
|     type DItem = (u8, u8, i64, i64); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, bytes) = bytes.split_first()?; | ||||
|         let (level, bytes) = bytes.split_first()?; | ||||
|  | ||||
|         let left = bytes[..8].try_into().map(i64_from_bytes).ok()?; | ||||
|         let right = if *level != 0 { | ||||
|             bytes[8..].try_into().map(i64_from_bytes).ok()? | ||||
|         } else { | ||||
|             left | ||||
|         }; | ||||
|  | ||||
|         Some((*field_id, *level, left, right)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { | ||||
|     type EItem = (u8, u8, i64, i64); | ||||
|  | ||||
|     fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let left = i64_into_bytes(*left); | ||||
|         let right = i64_into_bytes(*right); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(2 + left.len() + right.len()); | ||||
|         bytes.push(*field_id); | ||||
|         bytes.push(*level); | ||||
|         bytes.extend_from_slice(&left[..]); | ||||
|         if *level != 0 { | ||||
|             bytes.extend_from_slice(&right[..]); | ||||
|         } | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| @@ -1,50 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
|  | ||||
| use crate::facet::value_encoding::f64_into_bytes; | ||||
|  | ||||
| pub struct FacetValueF64Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { | ||||
|     type DItem = (u8, f64); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, buffer) = bytes.split_first()?; | ||||
|         let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; | ||||
|         Some((*field_id, value)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for FacetValueF64Codec { | ||||
|     type EItem = (u8, f64); | ||||
|  | ||||
|     fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut buffer = [0u8; 16]; | ||||
|  | ||||
|         // Write the globally ordered float. | ||||
|         let bytes = f64_into_bytes(*value)?; | ||||
|         buffer[..8].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         // Then the f64 value just to be able to read it back. | ||||
|         let bytes = value.to_be_bytes(); | ||||
|         buffer[8..].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(buffer.len() + 1); | ||||
|         bytes.push(*field_id); | ||||
|         bytes.extend_from_slice(&buffer[..]); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use heed::{BytesEncode, BytesDecode}; | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn globally_ordered_f64() { | ||||
|         let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); | ||||
|         let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); | ||||
|         assert_eq!((name, value), (3, -32.0)); | ||||
|     } | ||||
| } | ||||
| @@ -1,28 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
|  | ||||
| use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; | ||||
|  | ||||
| pub struct FacetValueI64Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { | ||||
|     type DItem = (u8, i64); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, buffer) = bytes.split_first()?; | ||||
|         let value = buffer.try_into().map(i64_from_bytes).ok()?; | ||||
|         Some((*field_id, value)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for FacetValueI64Codec { | ||||
|     type EItem = (u8, i64); | ||||
|  | ||||
|     fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let value = i64_into_bytes(*value); | ||||
|         let mut bytes = Vec::with_capacity(value.len() + 1); | ||||
|         bytes.push(*field_id); | ||||
|         bytes.extend_from_slice(&value[..]); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| @@ -1,7 +1,7 @@ | ||||
| mod facet_value_f64_codec; | ||||
| mod facet_value_i64_codec; | ||||
| mod facet_level_value_f64_codec; | ||||
| mod facet_level_value_i64_codec; | ||||
| mod facet_value_string_codec; | ||||
|  | ||||
| pub use self::facet_value_f64_codec::FacetValueF64Codec; | ||||
| pub use self::facet_value_i64_codec::FacetValueI64Codec; | ||||
| pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; | ||||
| pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; | ||||
| pub use self::facet_value_string_codec::FacetValueStringCodec; | ||||
|   | ||||
							
								
								
									
										22
									
								
								src/index.rs
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								src/index.rs
									
									
									
									
									
								
							| @@ -18,6 +18,7 @@ use crate::{ | ||||
|  | ||||
| pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||
| pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||
| pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; | ||||
| pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; | ||||
| pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | ||||
| pub const PRIMARY_KEY_KEY: &str = "primary-key"; | ||||
| @@ -224,6 +225,27 @@ impl Index { | ||||
|         Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     /* faceted documents ids */ | ||||
|  | ||||
|     /// Writes the documents ids that are faceted under this field id. | ||||
|     pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { | ||||
|         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) | ||||
|     } | ||||
|  | ||||
|     /// Retrieve all the documents ids that faceted under this field id. | ||||
|     pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result<RoaringBitmap> { | ||||
|         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { | ||||
|             Some(docids) => Ok(docids), | ||||
|             None => Ok(RoaringBitmap::new()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /* words fst */ | ||||
|  | ||||
|     /// Writes the FST which is the words dictionnary of the engine. | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| #[macro_use] extern crate pest_derive; | ||||
|  | ||||
| mod criterion; | ||||
| mod external_documents_ids; | ||||
| mod fields_ids_map; | ||||
| @@ -24,7 +26,7 @@ pub use self::criterion::{Criterion, default_criteria}; | ||||
| pub use self::external_documents_ids::ExternalDocumentsIds; | ||||
| pub use self::fields_ids_map::FieldsIdsMap; | ||||
| pub use self::index::Index; | ||||
| pub use self::search::{Search, SearchResult}; | ||||
| pub use self::search::{Search, FacetCondition, SearchResult}; | ||||
| pub use self::heed_codec::{ | ||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, | ||||
|     ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
|   | ||||
							
								
								
									
										29
									
								
								src/search/facet/grammar.pest
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								src/search/facet/grammar.pest
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| key = _{quoted | word} | ||||
| value = _{quoted | word} | ||||
| quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP  } | ||||
| string = {char*} | ||||
| word = ${(LETTER | NUMBER | "_" | "-" | ".")+} | ||||
|  | ||||
| char =  _{ !(PEEK | "\\") ~ ANY | ||||
|     | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") | ||||
|     | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} | ||||
|  | ||||
| condition = _{between | eq | greater | less | geq | leq | neq} | ||||
| between = {key ~ value ~ "TO" ~ value} | ||||
| geq = {key ~ ">=" ~ value} | ||||
| leq = {key ~ "<=" ~ value} | ||||
| neq = {key ~ "!=" ~ value} | ||||
| eq = {key ~ "=" ~ value} | ||||
| greater = {key ~ ">" ~ value} | ||||
| less = {key ~ "<" ~ value} | ||||
|  | ||||
| prgm = {SOI ~ expr ~ EOI} | ||||
| expr = _{ ( term ~ (operation ~ term)* ) } | ||||
| term = { ("(" ~ expr ~ ")") | condition | not } | ||||
| operation = _{ and | or } | ||||
| and = {"AND"} | ||||
| or = {"OR"} | ||||
|  | ||||
| not = {"NOT" ~ term} | ||||
|  | ||||
| WHITESPACE = _{ " " } | ||||
							
								
								
									
										655
									
								
								src/search/facet/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										655
									
								
								src/search/facet/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,655 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::fmt::Debug; | ||||
| use std::ops::Bound::{self, Unbounded, Included, Excluded}; | ||||
| use std::str::FromStr; | ||||
|  | ||||
| use heed::types::{ByteSlice, DecodeIgnore}; | ||||
| use log::debug; | ||||
| use num_traits::Bounded; | ||||
| use parser::{PREC_CLIMBER, FilterParser}; | ||||
| use pest::error::{Error as PestError, ErrorVariant}; | ||||
| use pest::iterators::{Pair, Pairs}; | ||||
| use pest::Parser; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::FacetValueStringCodec; | ||||
| use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; | ||||
| use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec}; | ||||
|  | ||||
| use self::FacetCondition::*; | ||||
| use self::FacetNumberOperator::*; | ||||
| use self::parser::Rule; | ||||
|  | ||||
| mod parser; | ||||
|  | ||||
| #[derive(Debug, Copy, Clone, PartialEq)] | ||||
| pub enum FacetNumberOperator<T> { | ||||
|     GreaterThan(T), | ||||
|     GreaterThanOrEqual(T), | ||||
|     Equal(T), | ||||
|     NotEqual(T), | ||||
|     LowerThan(T), | ||||
|     LowerThanOrEqual(T), | ||||
|     Between(T, T), | ||||
| } | ||||
|  | ||||
| impl<T> FacetNumberOperator<T> { | ||||
|     /// This method can return two operations in case it must express | ||||
|     /// an OR operation for the between case (i.e. `TO`). | ||||
|     fn negate(self) -> (Self, Option<Self>) { | ||||
|         match self { | ||||
|             GreaterThan(x)        => (LowerThanOrEqual(x), None), | ||||
|             GreaterThanOrEqual(x) => (LowerThan(x), None), | ||||
|             Equal(x)              => (NotEqual(x), None), | ||||
|             NotEqual(x)           => (Equal(x), None), | ||||
|             LowerThan(x)          => (GreaterThanOrEqual(x), None), | ||||
|             LowerThanOrEqual(x)   => (GreaterThan(x), None), | ||||
|             Between(x, y)         => (LowerThan(x), Some(GreaterThan(y))), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, PartialEq)] | ||||
| pub enum FacetStringOperator { | ||||
|     Equal(String), | ||||
|     NotEqual(String), | ||||
| } | ||||
|  | ||||
| impl FacetStringOperator { | ||||
|     fn negate(self) -> Self { | ||||
|         match self { | ||||
|             FacetStringOperator::Equal(x)    => FacetStringOperator::NotEqual(x), | ||||
|             FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, PartialEq)] | ||||
| pub enum FacetCondition { | ||||
|     OperatorI64(u8, FacetNumberOperator<i64>), | ||||
|     OperatorF64(u8, FacetNumberOperator<f64>), | ||||
|     OperatorString(u8, FacetStringOperator), | ||||
|     Or(Box<Self>, Box<Self>), | ||||
|     And(Box<Self>, Box<Self>), | ||||
| } | ||||
|  | ||||
| fn get_field_id_facet_type<'a>( | ||||
|     fields_ids_map: &FieldsIdsMap, | ||||
|     faceted_fields: &HashMap<u8, FacetType>, | ||||
|     items: &mut Pairs<'a, Rule>, | ||||
| ) -> Result<(u8, FacetType), PestError<Rule>> | ||||
| { | ||||
|     // lexing ensures that we at least have a key | ||||
|     let key = items.next().unwrap(); | ||||
|     let field_id = fields_ids_map | ||||
|         .id(key.as_str()) | ||||
|         .ok_or_else(|| { | ||||
|             PestError::new_from_span( | ||||
|                 ErrorVariant::CustomError { | ||||
|                     message: format!( | ||||
|                         "attribute `{}` not found, available attributes are: {}", | ||||
|                         key.as_str(), | ||||
|                         fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ") | ||||
|                     ), | ||||
|                 }, | ||||
|                 key.as_span(), | ||||
|             ) | ||||
|         })?; | ||||
|  | ||||
|     let facet_type = faceted_fields | ||||
|         .get(&field_id) | ||||
|         .copied() | ||||
|         .ok_or_else(|| { | ||||
|             PestError::new_from_span( | ||||
|                 ErrorVariant::CustomError { | ||||
|                     message: format!( | ||||
|                         "attribute `{}` is not faceted, available faceted attributes are: {}", | ||||
|                         key.as_str(), | ||||
|                         faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ") | ||||
|                     ), | ||||
|                 }, | ||||
|                 key.as_span(), | ||||
|             ) | ||||
|         })?; | ||||
|  | ||||
|     Ok((field_id, facet_type)) | ||||
| } | ||||
|  | ||||
| fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>> | ||||
| where T: FromStr, | ||||
|       T::Err: ToString, | ||||
| { | ||||
|     match pair.as_str().parse() { | ||||
|         Ok(value) => Ok(value), | ||||
|         Err(e) => { | ||||
|             Err(PestError::<Rule>::new_from_span( | ||||
|                 ErrorVariant::CustomError { message: e.to_string() }, | ||||
|                 pair.as_span(), | ||||
|             )) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FacetCondition { | ||||
|     pub fn from_str( | ||||
|         rtxn: &heed::RoTxn, | ||||
|         index: &Index, | ||||
|         expression: &str, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|         let faceted_fields = index.faceted_fields(rtxn)?; | ||||
|         let lexed = FilterParser::parse(Rule::prgm, expression)?; | ||||
|         FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) | ||||
|     } | ||||
|  | ||||
|     fn from_pairs( | ||||
|         fim: &FieldsIdsMap, | ||||
|         ff: &HashMap<u8, FacetType>, | ||||
|         expression: Pairs<Rule>, | ||||
|     ) -> anyhow::Result<Self> | ||||
|     { | ||||
|         PREC_CLIMBER.climb( | ||||
|             expression, | ||||
|             |pair: Pair<Rule>| match pair.as_rule() { | ||||
|                 Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), | ||||
|                 Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), | ||||
|                 Rule::eq => Ok(Self::equal(fim, ff, pair)?), | ||||
|                 Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), | ||||
|                 Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), | ||||
|                 Rule::less => Ok(Self::lower_than(fim, ff, pair)?), | ||||
|                 Rule::between => Ok(Self::between(fim, ff, pair)?), | ||||
|                 Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), | ||||
|                 Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), | ||||
|                 Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), | ||||
|                 _ => unreachable!(), | ||||
|             }, | ||||
|             |lhs: anyhow::Result<Self>, op: Pair<Rule>, rhs: anyhow::Result<Self>| { | ||||
|                 match op.as_rule() { | ||||
|                     Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), | ||||
|                     Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), | ||||
|                     _ => unreachable!(), | ||||
|                 } | ||||
|             }, | ||||
|         ) | ||||
|     } | ||||
|  | ||||
|     fn negate(self) -> FacetCondition { | ||||
|         match self { | ||||
|             OperatorI64(fid, op) => match op.negate() { | ||||
|                 (op, None) => OperatorI64(fid, op), | ||||
|                 (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), | ||||
|             }, | ||||
|             OperatorF64(fid, op) => match op.negate() { | ||||
|                 (op, None) => OperatorF64(fid, op), | ||||
|                 (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), | ||||
|             }, | ||||
|             OperatorString(fid, op) => OperatorString(fid, op.negate()), | ||||
|             Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), | ||||
|             And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn between( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let item_span = item.as_span(); | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let lvalue = items.next().unwrap(); | ||||
|         let rvalue = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => { | ||||
|                 let lvalue = pest_parse(lvalue)?; | ||||
|                 let rvalue = pest_parse(rvalue)?; | ||||
|                 Ok(OperatorI64(fid, Between(lvalue, rvalue))) | ||||
|             }, | ||||
|             FacetType::Float => { | ||||
|                 let lvalue = pest_parse(lvalue)?; | ||||
|                 let rvalue = pest_parse(rvalue)?; | ||||
|                 Ok(OperatorF64(fid, Between(lvalue, rvalue))) | ||||
|             }, | ||||
|             FacetType::String => { | ||||
|                 Err(PestError::<Rule>::new_from_span( | ||||
|                     ErrorVariant::CustomError { | ||||
|                         message: format!("invalid operator on a faceted string"), | ||||
|                     }, | ||||
|                     item_span, | ||||
|                 ).into()) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn equal( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let value = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), | ||||
|             FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), | ||||
|             FacetType::String => { | ||||
|                 Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string()))) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn greater_than( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let item_span = item.as_span(); | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let value = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), | ||||
|             FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), | ||||
|             FacetType::String => { | ||||
|                 Err(PestError::<Rule>::new_from_span( | ||||
|                     ErrorVariant::CustomError { | ||||
|                         message: format!("invalid operator on a faceted string"), | ||||
|                     }, | ||||
|                     item_span, | ||||
|                 ).into()) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn greater_than_or_equal( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let item_span = item.as_span(); | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let value = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), | ||||
|             FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), | ||||
|             FacetType::String => { | ||||
|                 Err(PestError::<Rule>::new_from_span( | ||||
|                     ErrorVariant::CustomError { | ||||
|                         message: format!("invalid operator on a faceted string"), | ||||
|                     }, | ||||
|                     item_span, | ||||
|                 ).into()) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn lower_than( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let item_span = item.as_span(); | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let value = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), | ||||
|             FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), | ||||
|             FacetType::String => { | ||||
|                 Err(PestError::<Rule>::new_from_span( | ||||
|                     ErrorVariant::CustomError { | ||||
|                         message: format!("invalid operator on a faceted string"), | ||||
|                     }, | ||||
|                     item_span, | ||||
|                 ).into()) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn lower_than_or_equal( | ||||
|         fields_ids_map: &FieldsIdsMap, | ||||
|         faceted_fields: &HashMap<u8, FacetType>, | ||||
|         item: Pair<Rule>, | ||||
|     ) -> anyhow::Result<FacetCondition> | ||||
|     { | ||||
|         let item_span = item.as_span(); | ||||
|         let mut items = item.into_inner(); | ||||
|         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||
|         let value = items.next().unwrap(); | ||||
|         match ftype { | ||||
|             FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), | ||||
|             FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), | ||||
|             FacetType::String => { | ||||
|                 Err(PestError::<Rule>::new_from_span( | ||||
|                     ErrorVariant::CustomError { | ||||
|                         message: format!("invalid operator on a faceted string"), | ||||
|                     }, | ||||
|                     item_span, | ||||
|                 ).into()) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FacetCondition { | ||||
|     /// Aggregates the documents ids that are part of the specified range automatically | ||||
|     /// going deeper through the levels. | ||||
|     fn explore_facet_levels<'t, T: 't, KC>( | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|         field_id: u8, | ||||
|         level: u8, | ||||
|         left: Bound<T>, | ||||
|         right: Bound<T>, | ||||
|         output: &mut RoaringBitmap, | ||||
|     ) -> anyhow::Result<()> | ||||
|     where | ||||
|         T: Copy + PartialEq + PartialOrd + Bounded + Debug, | ||||
|         KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||
|         KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||
|     { | ||||
|         match (left, right) { | ||||
|             // If the request is an exact value we must go directly to the deepest level. | ||||
|             (Included(l), Included(r)) if l == r && level > 0 => { | ||||
|                 return Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, 0, left, right, output); | ||||
|             }, | ||||
|             // lower TO upper when lower > upper must return no result | ||||
|             (Included(l), Included(r)) if l > r => return Ok(()), | ||||
|             (Included(l), Excluded(r)) if l >= r => return Ok(()), | ||||
|             (Excluded(l), Excluded(r)) if l >= r => return Ok(()), | ||||
|             (Excluded(l), Included(r)) if l >= r => return Ok(()), | ||||
|             (_, _) => (), | ||||
|         } | ||||
|  | ||||
|         let mut left_found = None; | ||||
|         let mut right_found = None; | ||||
|  | ||||
|         // We must create a custom iterator to be able to iterate over the | ||||
|         // requested range as the range iterator cannot express some conditions. | ||||
|         let left_bound = match left { | ||||
|             Included(left) => Included((field_id, level, left, T::min_value())), | ||||
|             Excluded(left) => Excluded((field_id, level, left, T::min_value())), | ||||
|             Unbounded => Unbounded, | ||||
|         }; | ||||
|         let right_bound = Included((field_id, level, T::max_value(), T::max_value())); | ||||
|         // We also make sure that we don't decode the data before we are sure we must return it. | ||||
|         let iter = db | ||||
|             .remap_key_type::<KC>() | ||||
|             .lazily_decode_data() | ||||
|             .range(rtxn, &(left_bound, right_bound))? | ||||
|             .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { | ||||
|                 match right { | ||||
|                     Included(right) => *r <= right, | ||||
|                     Excluded(right) => *r < right, | ||||
|                     Unbounded => true, | ||||
|                 } | ||||
|             })) | ||||
|             .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); | ||||
|  | ||||
|         debug!("Iterating between {:?} and {:?} (level {})", left, right, level); | ||||
|  | ||||
|         for (i, result) in iter.enumerate() { | ||||
|             let ((_fid, level, l, r), docids) = result?; | ||||
|             debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); | ||||
|             output.union_with(&docids); | ||||
|             // We save the leftest and rightest bounds we actually found at this level. | ||||
|             if i == 0 { left_found = Some(l); } | ||||
|             right_found = Some(r); | ||||
|         } | ||||
|  | ||||
|         // Can we go deeper? | ||||
|         let deeper_level = match level.checked_sub(1) { | ||||
|             Some(level) => level, | ||||
|             None => return Ok(()), | ||||
|         }; | ||||
|  | ||||
|         // We must refine the left and right bounds of this range by retrieving the | ||||
|         // missing part in a deeper level. | ||||
|         match left_found.zip(right_found) { | ||||
|             Some((left_found, right_found)) => { | ||||
|                 // If the bound is satisfied we avoid calling this function again. | ||||
|                 if !matches!(left, Included(l) if l == left_found) { | ||||
|                     let sub_right = Excluded(left_found); | ||||
|                     debug!("calling left with {:?} to {:?} (level {})",  left, sub_right, deeper_level); | ||||
|                     Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, sub_right, output)?; | ||||
|                 } | ||||
|                 if !matches!(right, Included(r) if r == right_found) { | ||||
|                     let sub_left = Excluded(right_found); | ||||
|                     debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); | ||||
|                     Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, sub_left, right, output)?; | ||||
|                 } | ||||
|             }, | ||||
|             None => { | ||||
|                 // If we found nothing at this level it means that we must find | ||||
|                 // the same bounds but at a deeper, more precise level. | ||||
|                 Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, right, output)?; | ||||
|             }, | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn evaluate_number_operator<'t, T: 't, KC>( | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         index: &Index, | ||||
|         db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|         field_id: u8, | ||||
|         operator: FacetNumberOperator<T>, | ||||
|     ) -> anyhow::Result<RoaringBitmap> | ||||
|     where | ||||
|         T: Copy + PartialEq + PartialOrd + Bounded + Debug, | ||||
|         KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||
|         KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||
|     { | ||||
|         // Make sure we always bound the ranges with the field id and the level, | ||||
|         // as the facets values are all in the same database and prefixed by the | ||||
|         // field id and the level. | ||||
|         let (left, right) = match operator { | ||||
|             GreaterThan(val)        => (Excluded(val),            Included(T::max_value())), | ||||
|             GreaterThanOrEqual(val) => (Included(val),            Included(T::max_value())), | ||||
|             Equal(val)              => (Included(val),            Included(val)), | ||||
|             NotEqual(val)           => { | ||||
|                 let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; | ||||
|                 let docids = Self::evaluate_number_operator::<T, KC>(rtxn, index, db, field_id, Equal(val))?; | ||||
|                 return Ok(all_documents_ids - docids); | ||||
|             }, | ||||
|             LowerThan(val)          => (Included(T::min_value()), Excluded(val)), | ||||
|             LowerThanOrEqual(val)   => (Included(T::min_value()), Included(val)), | ||||
|             Between(left, right)    => (Included(left),           Included(right)), | ||||
|         }; | ||||
|  | ||||
|         // Ask for the biggest value that can exist for this specific field, if it exists | ||||
|         // that's fine if it don't, the value just before will be returned instead. | ||||
|         let biggest_level = db | ||||
|             .remap_types::<KC, DecodeIgnore>() | ||||
|             .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? | ||||
|             .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); | ||||
|  | ||||
|         match biggest_level { | ||||
|             Some(level) => { | ||||
|                 let mut output = RoaringBitmap::new(); | ||||
|                 Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, level, left, right, &mut output)?; | ||||
|                 Ok(output) | ||||
|             }, | ||||
|             None => Ok(RoaringBitmap::new()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn evaluate_string_operator( | ||||
|         rtxn: &heed::RoTxn, | ||||
|         index: &Index, | ||||
|         db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>, | ||||
|         field_id: u8, | ||||
|         operator: &FacetStringOperator, | ||||
|     ) -> anyhow::Result<RoaringBitmap> | ||||
|     { | ||||
|         match operator { | ||||
|             FacetStringOperator::Equal(string) => { | ||||
|                 match db.get(rtxn, &(field_id, string))? { | ||||
|                     Some(docids) => Ok(docids), | ||||
|                     None => Ok(RoaringBitmap::new()) | ||||
|                 } | ||||
|             }, | ||||
|             FacetStringOperator::NotEqual(string) => { | ||||
|                 let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; | ||||
|                 let op = FacetStringOperator::Equal(string.clone()); | ||||
|                 let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; | ||||
|                 return Ok(all_documents_ids - docids); | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn evaluate( | ||||
|         &self, | ||||
|         rtxn: &heed::RoTxn, | ||||
|         index: &Index, | ||||
|     ) -> anyhow::Result<RoaringBitmap> | ||||
|     { | ||||
|         let db = index.facet_field_id_value_docids; | ||||
|         match self { | ||||
|             OperatorI64(fid, op) => { | ||||
|                 Self::evaluate_number_operator::<i64, FacetLevelValueI64Codec>(rtxn, index, db, *fid, *op) | ||||
|             }, | ||||
|             OperatorF64(fid, op) => { | ||||
|                 Self::evaluate_number_operator::<f64, FacetLevelValueF64Codec>(rtxn, index, db, *fid, *op) | ||||
|             }, | ||||
|             OperatorString(fid, op) => { | ||||
|                 let db = db.remap_key_type::<FacetValueStringCodec>(); | ||||
|                 Self::evaluate_string_operator(rtxn, index, db, *fid, op) | ||||
|             }, | ||||
|             Or(lhs, rhs) => { | ||||
|                 let lhs = lhs.evaluate(rtxn, index)?; | ||||
|                 let rhs = rhs.evaluate(rtxn, index)?; | ||||
|                 Ok(lhs | rhs) | ||||
|             }, | ||||
|             And(lhs, rhs) => { | ||||
|                 let lhs = lhs.evaluate(rtxn, index)?; | ||||
|                 let rhs = rhs.evaluate(rtxn, index)?; | ||||
|                 Ok(lhs & rhs) | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|     use crate::update::Settings; | ||||
|     use heed::EnvOpenOptions; | ||||
|     use maplit::hashmap; | ||||
|  | ||||
|     #[test] | ||||
|     fn string() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         // Set the faceted fields to be the channel. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = Settings::new(&mut wtxn, &index); | ||||
|         builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // Test that the facet condition is correctly generated. | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); | ||||
|         let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into())); | ||||
|         assert_eq!(condition, expected); | ||||
|  | ||||
|         let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); | ||||
|         let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); | ||||
|         assert_eq!(condition, expected); | ||||
|  | ||||
|         let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); | ||||
|         let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); | ||||
|         assert_eq!(condition, expected); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn i64() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         // Set the faceted fields to be the channel. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = Settings::new(&mut wtxn, &index); | ||||
|         builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // Test that the facet condition is correctly generated. | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); | ||||
|         let expected = OperatorI64(1, Between(22, 44)); | ||||
|         assert_eq!(condition, expected); | ||||
|  | ||||
|         let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); | ||||
|         let expected = Or( | ||||
|             Box::new(OperatorI64(1, LowerThan(22))), | ||||
|             Box::new(OperatorI64(1, GreaterThan(44))), | ||||
|         ); | ||||
|         assert_eq!(condition, expected); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn parentheses() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         // Set the faceted fields to be the channel. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = Settings::new(&mut wtxn, &index); | ||||
|         builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order | ||||
|         builder.set_faceted_fields(hashmap!{ | ||||
|             "channel".into() => "string".into(), | ||||
|             "timestamp".into() => "integer".into(), | ||||
|         }); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // Test that the facet condition is correctly generated. | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         let condition = FacetCondition::from_str( | ||||
|             &rtxn, &index, | ||||
|             "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", | ||||
|         ).unwrap(); | ||||
|         let expected = Or( | ||||
|             Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), | ||||
|             Box::new(And( | ||||
|                 Box::new(OperatorI64(1, Between(22, 44))), | ||||
|                 Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))), | ||||
|             )) | ||||
|         ); | ||||
|         assert_eq!(condition, expected); | ||||
|  | ||||
|         let condition = FacetCondition::from_str( | ||||
|             &rtxn, &index, | ||||
|             "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", | ||||
|         ).unwrap(); | ||||
|         let expected = Or( | ||||
|             Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), | ||||
|             Box::new(Or( | ||||
|                 Box::new(Or( | ||||
|                     Box::new(OperatorI64(1, LowerThan(22))), | ||||
|                     Box::new(OperatorI64(1, GreaterThan(44))), | ||||
|                 )), | ||||
|                 Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))), | ||||
|             )), | ||||
|         ); | ||||
|         assert_eq!(condition, expected); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										12
									
								
								src/search/facet/parser.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								src/search/facet/parser.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,12 @@ | ||||
| use once_cell::sync::Lazy; | ||||
| use pest::prec_climber::{Operator, Assoc, PrecClimber}; | ||||
|  | ||||
| pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| { | ||||
|     use Assoc::*; | ||||
|     use Rule::*; | ||||
|     pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) | ||||
| }); | ||||
|  | ||||
| #[derive(Parser)] | ||||
| #[grammar = "search/facet/grammar.pest"] | ||||
| pub struct FilterParser; | ||||
| @@ -1,5 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::fmt; | ||||
| 
 | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use levenshtein_automata::DFA; | ||||
| @@ -8,17 +9,22 @@ use log::debug; | ||||
| use once_cell::sync::Lazy; | ||||
| use roaring::bitmap::RoaringBitmap; | ||||
| 
 | ||||
| use crate::query_tokens::{QueryTokens, QueryToken}; | ||||
| use crate::mdfs::Mdfs; | ||||
| use crate::query_tokens::{QueryTokens, QueryToken}; | ||||
| use crate::{Index, DocumentId}; | ||||
| 
 | ||||
| pub use self::facet::FacetCondition; | ||||
| 
 | ||||
| // Building these factories is not free.
 | ||||
| static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | ||||
| static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true)); | ||||
| static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | ||||
| 
 | ||||
| mod facet; | ||||
| 
 | ||||
| pub struct Search<'a> { | ||||
|     query: Option<String>, | ||||
|     facet_condition: Option<FacetCondition>, | ||||
|     offset: usize, | ||||
|     limit: usize, | ||||
|     rtxn: &'a heed::RoTxn<'a>, | ||||
| @@ -27,7 +33,7 @@ pub struct Search<'a> { | ||||
| 
 | ||||
| impl<'a> Search<'a> { | ||||
|     pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { | ||||
|         Search { query: None, offset: 0, limit: 20, rtxn, index } | ||||
|         Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } | ||||
|     } | ||||
| 
 | ||||
|     pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> { | ||||
| @@ -45,6 +51,11 @@ impl<'a> Search<'a> { | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { | ||||
|         self.facet_condition = Some(condition); | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     /// Extracts the query words from the query string and returns the DFAs accordingly.
 | ||||
|     /// TODO introduce settings for the number of typos regarding the words lengths.
 | ||||
|     fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { | ||||
| @@ -135,22 +146,44 @@ impl<'a> Search<'a> { | ||||
| 
 | ||||
|     pub fn execute(&self) -> anyhow::Result<SearchResult> { | ||||
|         let limit = self.limit; | ||||
| 
 | ||||
|         let fst = self.index.words_fst(self.rtxn)?; | ||||
| 
 | ||||
|         // Construct the DFAs related to the query words.
 | ||||
|         let dfas = match self.query.as_deref().map(Self::generate_query_dfas) { | ||||
|             Some(dfas) if !dfas.is_empty() => dfas, | ||||
|             _ => { | ||||
|         let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { | ||||
|             Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), | ||||
|             _otherwise => None, | ||||
|         }; | ||||
| 
 | ||||
|         // We create the original candidates with the facet conditions results.
 | ||||
|         let facet_candidates = match &self.facet_condition { | ||||
|             Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), | ||||
|             None => None, | ||||
|         }; | ||||
| 
 | ||||
|         debug!("facet candidates: {:?}", facet_candidates); | ||||
| 
 | ||||
|         let (candidates, derived_words) = match (facet_candidates, derived_words) { | ||||
|             (Some(mut facet_candidates), Some(derived_words)) => { | ||||
|                 let words_candidates = Self::compute_candidates(&derived_words); | ||||
|                 facet_candidates.intersect_with(&words_candidates); | ||||
|                 (facet_candidates, derived_words) | ||||
|             }, | ||||
|             (None, Some(derived_words)) => { | ||||
|                 (Self::compute_candidates(&derived_words), derived_words) | ||||
|             }, | ||||
|             (Some(facet_candidates), None) => { | ||||
|                 // If the query is not set or results in no DFAs but
 | ||||
|                 // there is some facet conditions we return a placeholder.
 | ||||
|                 let documents_ids = facet_candidates.iter().take(limit).collect(); | ||||
|                 return Ok(SearchResult { documents_ids, ..Default::default() }) | ||||
|             }, | ||||
|             (None, None) => { | ||||
|                 // If the query is not set or results in no DFAs we return a placeholder.
 | ||||
|                 let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); | ||||
|                 return Ok(SearchResult { documents_ids, ..Default::default() }) | ||||
|             }, | ||||
|         }; | ||||
| 
 | ||||
|         let derived_words = self.fetch_words_docids(&fst, dfas)?; | ||||
|         let candidates = Self::compute_candidates(&derived_words); | ||||
| 
 | ||||
|         debug!("candidates: {:?}", candidates); | ||||
| 
 | ||||
|         // The mana depth first search is a revised DFS that explore
 | ||||
| @@ -175,6 +208,18 @@ impl<'a> Search<'a> { | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| impl fmt::Debug for Search<'_> { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self; | ||||
|         f.debug_struct("Search") | ||||
|             .field("query", query) | ||||
|             .field("facet_condition", facet_condition) | ||||
|             .field("offset", offset) | ||||
|             .field("limit", limit) | ||||
|             .finish() | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #[derive(Default)] | ||||
| pub struct SearchResult { | ||||
|     pub found_words: HashSet<String>, | ||||
| @@ -1,10 +1,10 @@ | ||||
| use std::path::PathBuf; | ||||
| use std::{str, io}; | ||||
| use std::{str, io, fmt}; | ||||
|  | ||||
| use anyhow::Context; | ||||
| use crate::Index; | ||||
| use heed::EnvOpenOptions; | ||||
| use structopt::StructOpt; | ||||
| use crate::Index; | ||||
|  | ||||
| use Command::*; | ||||
|  | ||||
| @@ -89,6 +89,12 @@ enum Command { | ||||
|         field_name: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs some facets statistics for the given facet name. | ||||
|     FacetStats { | ||||
|         /// The field name in the document. | ||||
|         field_name: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs the total size of all the docid-word-positions keys and values. | ||||
|     TotalDocidWordPositionsSize, | ||||
|  | ||||
| @@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { | ||||
|         FacetValuesDocids { full_display, field_name } => { | ||||
|             facet_values_docids(&index, &rtxn, !full_display, field_name) | ||||
|         }, | ||||
|         FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), | ||||
|         TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), | ||||
|         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), | ||||
|         AverageNumberOfPositionsByWord => { | ||||
| @@ -225,46 +232,140 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| /// Helper function that converts the facet value key to a unique type | ||||
| /// that can be used to log or display purposes. | ||||
| fn facet_values_iter<'txn, DC: 'txn, T>( | ||||
|     rtxn: &'txn heed::RoTxn, | ||||
|     db: heed::Database<heed::types::ByteSlice, DC>, | ||||
|     field_id: u8, | ||||
|     facet_type: crate::facet::FacetType, | ||||
|     string_fn: impl Fn(&str) -> T + 'txn, | ||||
|     float_fn: impl Fn(u8, f64, f64) -> T + 'txn, | ||||
|     integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, | ||||
| ) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>> | ||||
| where | ||||
|     DC: heed::BytesDecode<'txn>, | ||||
| { | ||||
|     use crate::facet::FacetType; | ||||
|     use crate::heed_codec::facet::{ | ||||
|         FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, | ||||
|     }; | ||||
|  | ||||
|     let iter = db.prefix_iter(&rtxn, &[field_id])?; | ||||
|     match facet_type { | ||||
|         FacetType::String => { | ||||
|             let iter = iter.remap_key_type::<FacetValueStringCodec>() | ||||
|                 .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); | ||||
|             Ok(Box::new(iter) as Box<dyn Iterator<Item=_>>) | ||||
|         }, | ||||
|         FacetType::Float => { | ||||
|             let iter = iter.remap_key_type::<FacetLevelValueF64Codec>() | ||||
|                 .map(move |r| r.map(|((_, level, left, right), value)| { | ||||
|                     (float_fn(level, left, right), value) | ||||
|                 })); | ||||
|             Ok(Box::new(iter)) | ||||
|         }, | ||||
|         FacetType::Integer => { | ||||
|             let iter = iter.remap_key_type::<FacetLevelValueI64Codec>() | ||||
|                 .map(move |r| r.map(|((_, level, left, right), value)| { | ||||
|                     (integer_fn(level, left, right), value) | ||||
|                 })); | ||||
|             Ok(Box::new(iter)) | ||||
|         }, | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> String { | ||||
|     if level == 0 { | ||||
|         format!("{:?} (level {})", left, level) | ||||
|     } else { | ||||
|         format!("{:?} to {:?} (level {})", left, right, level) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | ||||
|     use std::cmp::Reverse; | ||||
|     use std::collections::BinaryHeap; | ||||
|     use heed::types::{Str, ByteSlice}; | ||||
|     use crate::heed_codec::BEU32StrCodec; | ||||
|  | ||||
|     let Index { | ||||
|         env: _env, | ||||
|         main, | ||||
|         word_docids, | ||||
|         docid_word_positions, | ||||
|         word_pair_proximity_docids, | ||||
|         facet_field_id_value_docids, | ||||
|         documents, | ||||
|     } = index; | ||||
|  | ||||
|     let main_name = "main"; | ||||
|     let word_docids_name = "word_docids"; | ||||
|     let docid_word_positions_name = "docid_word_positions"; | ||||
|     let word_pair_proximity_docids_name = "word_pair_proximity_docids"; | ||||
|     let facet_field_id_value_docids_name = "facet_field_id_value_docids"; | ||||
|     let documents_name = "documents"; | ||||
|  | ||||
|     let mut heap = BinaryHeap::with_capacity(limit + 1); | ||||
|  | ||||
|     if limit > 0 { | ||||
|         let words_fst = index.words_fst(rtxn)?; | ||||
|  | ||||
|         heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); | ||||
|         if heap.len() > limit { heap.pop(); } | ||||
|  | ||||
|         if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? { | ||||
|             heap.push(Reverse((documents.len(), format!("documents"), main_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { | ||||
|         if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { | ||||
|             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? { | ||||
|         for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let (word, value) = result?; | ||||
|             heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? { | ||||
|         for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let ((docid, word), value) = result?; | ||||
|             let key = format!("{} {}", docid, word); | ||||
|             heap.push(Reverse((value.len(), key, docid_word_positions_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let ((word1, word2, prox), value) = result?; | ||||
|             let key = format!("{} {} {}", word1, word2, prox); | ||||
|             heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = index.faceted_fields(rtxn)?; | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|         for (field_id, field_type) in faceted_fields { | ||||
|             let facet_name = fields_ids_map.name(field_id).unwrap(); | ||||
|  | ||||
|             let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>(); | ||||
|             let iter = facet_values_iter( | ||||
|                 rtxn, | ||||
|                 db, | ||||
|                 field_id, | ||||
|                 field_type, | ||||
|                 |key| key.to_owned(), | ||||
|                 facet_number_value_to_string, | ||||
|                 facet_number_value_to_string, | ||||
|             )?; | ||||
|  | ||||
|             for result in iter { | ||||
|                 let (fvalue, value) = result?; | ||||
|                 let key = format!("{} {}", facet_name, fvalue); | ||||
|                 heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); | ||||
|                 if heap.len() > limit { heap.pop(); } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let (id, value) = result?; | ||||
|             heap.push(Reverse((value.len(), id.to_string(), documents_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let stdout = io::stdout(); | ||||
| @@ -298,10 +399,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<Strin | ||||
| } | ||||
|  | ||||
| fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { | ||||
|     use crate::facet::FacetType; | ||||
|     use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; | ||||
|     use heed::{BytesDecode, Error::Decoding}; | ||||
|  | ||||
|     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||
|     let faceted_fields = index.faceted_fields(&rtxn)?; | ||||
|  | ||||
| @@ -310,51 +407,78 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam | ||||
|     let field_type = faceted_fields.get(&field_id) | ||||
|         .with_context(|| format!("field {} is not faceted", field_name))?; | ||||
|  | ||||
|     let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; | ||||
|     let iter = match field_type { | ||||
|         FacetType::String => { | ||||
|             let iter = iter | ||||
|                 .map(|result| result.and_then(|(key, value)| { | ||||
|                     let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?; | ||||
|                     Ok((key.to_string(), value)) | ||||
|                 })); | ||||
|             Box::new(iter) as Box<dyn Iterator<Item=_>> | ||||
|         }, | ||||
|         FacetType::Float => { | ||||
|             let iter = iter | ||||
|                 .map(|result| result.and_then(|(key, value)| { | ||||
|                     let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?; | ||||
|                     Ok((key.to_string(), value)) | ||||
|                 })); | ||||
|             Box::new(iter) | ||||
|         }, | ||||
|         FacetType::Integer => { | ||||
|             let iter = iter | ||||
|                 .map(|result| result.and_then(|(key, value)| { | ||||
|                     let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?; | ||||
|                     Ok((key.to_string(), value)) | ||||
|                 })); | ||||
|             Box::new(iter) | ||||
|         }, | ||||
|     }; | ||||
|  | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
|     wtr.write_record(&["facet_value", "documents_ids"])?; | ||||
|     wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?; | ||||
|  | ||||
|     let db = index.facet_field_id_value_docids; | ||||
|     let iter = facet_values_iter( | ||||
|         rtxn, | ||||
|         db, | ||||
|         field_id, | ||||
|         *field_type, | ||||
|         |key| key.to_owned(), | ||||
|         facet_number_value_to_string, | ||||
|         facet_number_value_to_string, | ||||
|     )?; | ||||
|  | ||||
|     for result in iter { | ||||
|         let (value, docids) = result?; | ||||
|         let count = docids.len(); | ||||
|         let docids = if debug { | ||||
|             format!("{:?}", docids) | ||||
|         } else { | ||||
|             format!("{:?}", docids.iter().collect::<Vec<_>>()) | ||||
|         }; | ||||
|         wtr.write_record(&[value, docids])?; | ||||
|         wtr.write_record(&[value, count.to_string(), docids])?; | ||||
|     } | ||||
|  | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { | ||||
|     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||
|     let faceted_fields = index.faceted_fields(&rtxn)?; | ||||
|  | ||||
|     let field_id = fields_ids_map.id(&field_name) | ||||
|         .with_context(|| format!("field {} not found", field_name))?; | ||||
|     let field_type = faceted_fields.get(&field_id) | ||||
|         .with_context(|| format!("field {} is not faceted", field_name))?; | ||||
|  | ||||
|     let db = index.facet_field_id_value_docids; | ||||
|     let iter = facet_values_iter( | ||||
|         rtxn, | ||||
|         db, | ||||
|         field_id, | ||||
|         *field_type, | ||||
|         |_key| 0u8, | ||||
|         |level, _left, _right| level, | ||||
|         |level, _left, _right| level, | ||||
|     )?; | ||||
|  | ||||
|     println!("The database {:?} facet stats", field_name); | ||||
|  | ||||
|     let mut level_size = 0; | ||||
|     let mut current_level = None; | ||||
|     for result in iter { | ||||
|         let (level, _) = result?; | ||||
|         if let Some(current) = current_level { | ||||
|             if current != level { | ||||
|                 println!("\tnumber of groups at level {}: {}", current, level_size); | ||||
|                 level_size = 0; | ||||
|             } | ||||
|         } | ||||
|         current_level = Some(level); | ||||
|         level_size += 1; | ||||
|     } | ||||
|  | ||||
|     if let Some(current) = current_level { | ||||
|         println!("\tnumber of groups at level {}: {}", current, level_size); | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { | ||||
|     use std::fs::File; | ||||
|     use std::io::Write as _; | ||||
|   | ||||
| @@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|  | ||||
|         // We retrieve the number of documents ids that we are deleting. | ||||
|         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||
|         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||
|  | ||||
|         // We clean some of the main engine datastructures. | ||||
|         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; | ||||
|         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; | ||||
|         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; | ||||
|  | ||||
|         // We clean all the faceted documents ids. | ||||
|         for (field_id, _) in faceted_fields { | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; | ||||
|         } | ||||
|  | ||||
|         // Clear the other databases. | ||||
|         word_docids.clear(self.wtxn)?; | ||||
|         docid_word_positions.clear(self.wtxn)?; | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| use fst::IntoStreamer; | ||||
| use heed::types::ByteSlice; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; | ||||
| @@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; | ||||
|             if let Some((key, mut docids)) = iter.next().transpose()? { | ||||
|                 if key == word.as_ref() { | ||||
|                     let previous_len = docids.len(); | ||||
|                     docids.difference_with(&self.documents_ids); | ||||
|                     if docids.is_empty() { | ||||
|                         iter.del_current()?; | ||||
|                         *must_remove = true; | ||||
|                     } else { | ||||
|                     } else if docids.len() != previous_len { | ||||
|                         iter.put_current(key, &docids)?; | ||||
|                     } | ||||
|                 } | ||||
| @@ -168,27 +170,37 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|         // We delete the documents ids that are under the pairs of words, | ||||
|         // it is faster and use no memory to iterate over all the words pairs than | ||||
|         // to compute the cartesian product of every words of the deleted documents. | ||||
|         let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?; | ||||
|         let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; | ||||
|         while let Some(result) = iter.next() { | ||||
|             let ((w1, w2, prox), mut docids) = result?; | ||||
|             let (bytes, mut docids) = result?; | ||||
|             let previous_len = docids.len(); | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             if docids.is_empty() { | ||||
|                 iter.del_current()?; | ||||
|             } else { | ||||
|                 iter.put_current(&(w1, w2, prox), &docids)?; | ||||
|             } else if docids.len() != previous_len { | ||||
|                 iter.put_current(bytes, &docids)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // Remove the documents ids from the faceted documents ids. | ||||
|         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||
|         for (field_id, _) in faceted_fields { | ||||
|             let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; | ||||
|         } | ||||
|  | ||||
|         // We delete the documents ids that are under the facet field id values. | ||||
|         let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; | ||||
|         while let Some(result) = iter.next() { | ||||
|             let (bytes, mut docids) = result?; | ||||
|             let previous_len = docids.len(); | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             if docids.is_empty() { | ||||
|                 iter.del_current()?; | ||||
|             } else { | ||||
|             } else if docids.len() != previous_len { | ||||
|                 iter.put_current(bytes, &docids)?; | ||||
|             } | ||||
|         } | ||||
|   | ||||
							
								
								
									
										256
									
								
								src/update/facets.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										256
									
								
								src/update/facets.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,256 @@ | ||||
| use std::cmp; | ||||
| use std::fs::File; | ||||
| use std::num::NonZeroUsize; | ||||
|  | ||||
| use grenad::{CompressionType, Reader, Writer, FileFuse}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore}; | ||||
| use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| use num_traits::{Bounded, Zero}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; | ||||
| use crate::Index; | ||||
| use crate::update::index_documents::WriteMethod; | ||||
| use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; | ||||
|  | ||||
| pub struct Facets<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     min_level_size: NonZeroUsize, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { | ||||
|         Facets { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             level_group_size: NonZeroUsize::new(4).unwrap(), | ||||
|             min_level_size: NonZeroUsize::new(5).unwrap(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { | ||||
|         self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { | ||||
|         self.min_level_size = value; | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         // We get the faceted fields to be able to create the facet levels. | ||||
|         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||
|  | ||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||
|         for (field_id, facet_type) in faceted_fields { | ||||
|             let (content, documents_ids) = match facet_type { | ||||
|                 FacetType::Integer => { | ||||
|                     clear_field_levels::<i64, FacetLevelValueI64Codec>( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     let documents_ids = compute_faceted_documents_ids( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     let content = compute_facet_levels::<i64, FacetLevelValueI64Codec>( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         self.chunk_compression_type, | ||||
|                         self.chunk_compression_level, | ||||
|                         self.chunk_fusing_shrink_size, | ||||
|                         self.level_group_size, | ||||
|                         self.min_level_size, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     (Some(content), documents_ids) | ||||
|                 }, | ||||
|                 FacetType::Float => { | ||||
|                     clear_field_levels::<f64, FacetLevelValueF64Codec>( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     let documents_ids = compute_faceted_documents_ids( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     let content = compute_facet_levels::<f64, FacetLevelValueF64Codec>( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         self.chunk_compression_type, | ||||
|                         self.chunk_compression_level, | ||||
|                         self.chunk_fusing_shrink_size, | ||||
|                         self.level_group_size, | ||||
|                         self.min_level_size, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     (Some(content), documents_ids) | ||||
|                 }, | ||||
|                 FacetType::String => { | ||||
|                     let documents_ids = compute_faceted_documents_ids( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     (None, documents_ids) | ||||
|                 }, | ||||
|             }; | ||||
|  | ||||
|             if let Some(content) = content { | ||||
|                 write_into_lmdb_database( | ||||
|                     self.wtxn, | ||||
|                     *self.index.facet_field_id_value_docids.as_polymorph(), | ||||
|                     content, | ||||
|                     |_, _| anyhow::bail!("invalid facet level merging"), | ||||
|                     WriteMethod::GetMergePut, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn clear_field_levels<'t, T: 't, KC>( | ||||
|     wtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|     field_id: u8, | ||||
| ) -> heed::Result<()> | ||||
| where | ||||
|     T: Copy + Bounded, | ||||
|     KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||
|     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||
| { | ||||
|     let left = (field_id, 1, T::min_value(), T::min_value()); | ||||
|     let right = (field_id, u8::MAX, T::max_value(), T::max_value()); | ||||
|     let range = left..=right; | ||||
|     db.remap_key_type::<KC>().delete_range(wtxn, &range).map(drop) | ||||
| } | ||||
|  | ||||
| fn compute_facet_levels<'t, T: 't, KC>( | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     min_level_size: NonZeroUsize, | ||||
|     field_id: u8, | ||||
| ) -> anyhow::Result<Reader<FileFuse>> | ||||
| where | ||||
|     T: Copy + PartialEq + PartialOrd + Bounded + Zero, | ||||
|     KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||
|     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||
| { | ||||
|     let first_level_size = db.prefix_iter(rtxn, &[field_id])? | ||||
|         .remap_types::<DecodeIgnore, DecodeIgnore>() | ||||
|         .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; | ||||
|  | ||||
|     // It is forbidden to keep a cursor and write in a database at the same time with LMDB | ||||
|     // therefore we write the facet levels entries into a grenad file before transfering them. | ||||
|     let mut writer = tempfile::tempfile().and_then(|file| { | ||||
|         create_writer(compression_type, compression_level, file) | ||||
|     })?; | ||||
|  | ||||
|     let level_0_range = { | ||||
|         let left = (field_id, 0, T::min_value(), T::min_value()); | ||||
|         let right = (field_id, 0, T::max_value(), T::max_value()); | ||||
|         left..=right | ||||
|     }; | ||||
|  | ||||
|     // Groups sizes are always a power of the original level_group_size and therefore a group | ||||
|     // always maps groups of the previous level and never splits previous levels groups in half. | ||||
|     let group_size_iter = (1u8..) | ||||
|         .map(|l| (l, level_group_size.get().pow(l as u32))) | ||||
|         .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); | ||||
|  | ||||
|     for (level, group_size) in group_size_iter { | ||||
|         let mut left = T::zero(); | ||||
|         let mut right = T::zero(); | ||||
|         let mut group_docids = RoaringBitmap::new(); | ||||
|  | ||||
|         let db = db.remap_key_type::<KC>(); | ||||
|         for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { | ||||
|             let ((_field_id, _level, value, _right), docids) = result?; | ||||
|  | ||||
|             if i == 0 { | ||||
|                 left = value; | ||||
|             } else if i % group_size == 0 { | ||||
|                 // we found the first bound of the next group, we must store the left | ||||
|                 // and right bounds associated with the docids. | ||||
|                 write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?; | ||||
|  | ||||
|                 // We save the left bound for the new group and also reset the docids. | ||||
|                 group_docids = RoaringBitmap::new(); | ||||
|                 left = value; | ||||
|             } | ||||
|  | ||||
|             // The right bound is always the bound we run through. | ||||
|             group_docids.union_with(&docids); | ||||
|             right = value; | ||||
|         } | ||||
|  | ||||
|         if !group_docids.is_empty() { | ||||
|             write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer, shrink_size) | ||||
| } | ||||
|  | ||||
| fn compute_faceted_documents_ids( | ||||
|     rtxn: &heed::RoTxn, | ||||
|     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|     field_id: u8, | ||||
| ) -> anyhow::Result<RoaringBitmap> | ||||
| { | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
|     for result in db.prefix_iter(rtxn, &[field_id])? { | ||||
|         let (_key, docids) = result?; | ||||
|         documents_ids.union_with(&docids); | ||||
|     } | ||||
|     Ok(documents_ids) | ||||
| } | ||||
|  | ||||
| fn write_entry<T, KC>( | ||||
|     writer: &mut Writer<File>, | ||||
|     field_id: u8, | ||||
|     level: u8, | ||||
|     left: T, | ||||
|     right: T, | ||||
|     ids: &RoaringBitmap, | ||||
| ) -> anyhow::Result<()> | ||||
| where | ||||
|     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||
| { | ||||
|     let key = (field_id, level, left, right); | ||||
|     let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||
|     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; | ||||
|     writer.insert(&key, &data)?; | ||||
|     Ok(()) | ||||
| } | ||||
| @@ -2,6 +2,7 @@ use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| use std::fs::File; | ||||
| use std::io::{self, Seek, SeekFrom}; | ||||
| use std::num::NonZeroUsize; | ||||
| use std::sync::mpsc::sync_channel; | ||||
| use std::time::Instant; | ||||
|  | ||||
| @@ -15,7 +16,7 @@ use rayon::prelude::*; | ||||
| use rayon::ThreadPool; | ||||
|  | ||||
| use crate::index::Index; | ||||
| use crate::update::UpdateIndexingStep; | ||||
| use crate::update::{Facets, UpdateIndexingStep}; | ||||
| use self::store::{Store, Readers}; | ||||
| use self::merge_function::{ | ||||
|     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, | ||||
| @@ -31,12 +32,12 @@ mod store; | ||||
| mod transform; | ||||
|  | ||||
| #[derive(Debug, Copy, Clone)] | ||||
| enum WriteMethod { | ||||
| pub enum WriteMethod { | ||||
|     Append, | ||||
|     GetMergePut, | ||||
| } | ||||
|  | ||||
| fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { | ||||
| pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { | ||||
|     let mut builder = Writer::builder(); | ||||
|     builder.compression_type(typ); | ||||
|     if let Some(level) = level { | ||||
| @@ -45,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Re | ||||
|     builder.build(file) | ||||
| } | ||||
|  | ||||
| fn create_sorter( | ||||
| pub fn create_sorter( | ||||
|     merge: MergeFn, | ||||
|     chunk_compression_type: CompressionType, | ||||
|     chunk_compression_level: Option<u32>, | ||||
| @@ -71,7 +72,7 @@ fn create_sorter( | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> { | ||||
| pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> { | ||||
|     let mut file = writer.into_inner()?; | ||||
|     file.seek(SeekFrom::Start(0))?; | ||||
|     let file = if let Some(shrink_size) = shrink_size { | ||||
| @@ -82,13 +83,13 @@ fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow: | ||||
|     Reader::new(file).map_err(Into::into) | ||||
| } | ||||
|  | ||||
| fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> { | ||||
| pub fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> { | ||||
|     let mut builder = Merger::builder(merge); | ||||
|     builder.extend(sources); | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| fn merge_into_lmdb_database( | ||||
| pub fn merge_into_lmdb_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     sources: Vec<Reader<FileFuse>>, | ||||
| @@ -132,7 +133,7 @@ fn merge_into_lmdb_database( | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn write_into_lmdb_database( | ||||
| pub fn write_into_lmdb_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     mut reader: Reader<FileFuse>, | ||||
| @@ -157,7 +158,7 @@ fn write_into_lmdb_database( | ||||
|                 match iter.next().transpose()? { | ||||
|                     Some((key, old_val)) if key == k => { | ||||
|                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; | ||||
|                         let val = merge(k, &vals).expect("merge failed"); | ||||
|                         let val = merge(k, &vals)?; | ||||
|                         iter.put_current(k, &val)?; | ||||
|                     }, | ||||
|                     _ => { | ||||
| @@ -207,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) thread_pool: Option<&'a ThreadPool>, | ||||
|     facet_level_group_size: Option<NonZeroUsize>, | ||||
|     facet_min_level_size: Option<NonZeroUsize>, | ||||
|     update_method: IndexDocumentsMethod, | ||||
|     update_format: UpdateFormat, | ||||
|     autogenerate_docids: bool, | ||||
| @@ -225,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             thread_pool: None, | ||||
|             facet_level_group_size: None, | ||||
|             facet_min_level_size: None, | ||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
|             update_format: UpdateFormat::Json, | ||||
|             autogenerate_docids: true, | ||||
| @@ -308,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                 thread_pool: self.thread_pool, | ||||
|             }; | ||||
|             let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; | ||||
|             debug!("documents to delete {:?}", replaced_documents_ids); | ||||
|             deletion_builder.delete_documents(&replaced_documents_ids); | ||||
|             let _deleted_documents_count = deletion_builder.execute()?; | ||||
|             let deleted_documents_count = deletion_builder.execute()?; | ||||
|             debug!("{} documents actually deleted", deleted_documents_count); | ||||
|         } | ||||
|  | ||||
|         let mmap; | ||||
| @@ -327,7 +334,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         enum DatabaseType { | ||||
|             Main, | ||||
|             WordDocids, | ||||
|             FacetValuesDocids, | ||||
|             FacetLevel0ValuesDocids, | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||
| @@ -427,7 +434,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     (DatabaseType::Main, main_readers, main_merge as MergeFn), | ||||
|                     (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), | ||||
|                     ( | ||||
|                         DatabaseType::FacetValuesDocids, | ||||
|                         DatabaseType::FacetLevel0ValuesDocids, | ||||
|                         facet_field_value_docids_readers, | ||||
|                         facet_field_value_docids_merge, | ||||
|                     ), | ||||
| @@ -557,7 +564,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::FacetValuesDocids => { | ||||
|                 DatabaseType::FacetLevel0ValuesDocids => { | ||||
|                     debug!("Writing the facet values docids into LMDB on disk..."); | ||||
|                     let db = *self.index.facet_field_id_value_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
| @@ -577,6 +584,18 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             }); | ||||
|         } | ||||
|  | ||||
|         let mut builder = Facets::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         if let Some(value) = self.facet_level_group_size { | ||||
|             builder.level_group_size(value); | ||||
|         } | ||||
|         if let Some(value) = self.facet_min_level_size { | ||||
|             builder.min_level_size(value); | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         debug_assert_eq!(database_count, total_databases); | ||||
|  | ||||
|         info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); | ||||
|   | ||||
| @@ -19,7 +19,7 @@ use tempfile::tempfile; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; | ||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; | ||||
| use crate::tokenizer::{simple_tokenizer, only_token}; | ||||
| use crate::update::UpdateIndexingStep; | ||||
| use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; | ||||
| @@ -337,8 +337,8 @@ impl Store { | ||||
|         for ((field_id, value), docids) in iter { | ||||
|             let result = match value { | ||||
|                 String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), | ||||
|                 Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned), | ||||
|                 Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned), | ||||
|                 Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), | ||||
|                 Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned), | ||||
|             }; | ||||
|             let key = result.context("could not serialize facet key")?; | ||||
|             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||
| @@ -399,7 +399,7 @@ impl Store { | ||||
|             // We skip documents that must not be indexed by this thread. | ||||
|             if count % num_threads == thread_index { | ||||
|                 // This is a log routine that we do every `log_every_n` documents. | ||||
|                 if log_every_n.map_or(false, |len| count % len == 0) { | ||||
|                 if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { | ||||
|                     info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); | ||||
|                     progress_callback(UpdateIndexingStep::IndexDocuments { | ||||
|                         documents_seen: count, | ||||
| @@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec | ||||
|             Value::Null => Ok(()), | ||||
|             Value::Bool(b) => Ok(output.push(Integer(*b as i64))), | ||||
|             Value::Number(number) => match ftype { | ||||
|                 FacetType::String => bail!("invalid facet type, expecting {} found number", ftype), | ||||
|                 FacetType::String => { | ||||
|                     let string = SmallString32::from(number.to_string()); | ||||
|                     Ok(output.push(String(string))) | ||||
|                 }, | ||||
|                 FacetType::Float => match number.as_f64() { | ||||
|                     Some(float) => Ok(output.push(Float(OrderedFloat(float)))), | ||||
|                     None => bail!("invalid facet type, expecting {} found integer", ftype), | ||||
| @@ -586,7 +589,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec | ||||
|                 }, | ||||
|             }, | ||||
|             Value::String(string) => { | ||||
|                 let string = string.trim(); | ||||
|                 let string = string.trim().to_lowercase(); | ||||
|                 if string.is_empty() { return Ok(()) } | ||||
|                 match ftype { | ||||
|                     FacetType::String => { | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| mod available_documents_ids; | ||||
| mod clear_documents; | ||||
| mod delete_documents; | ||||
| mod facets; | ||||
| mod index_documents; | ||||
| mod settings; | ||||
| mod update_builder; | ||||
| @@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; | ||||
| pub use self::clear_documents::ClearDocuments; | ||||
| pub use self::delete_documents::DeleteDocuments; | ||||
| pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; | ||||
| pub use self::facets::Facets; | ||||
| pub use self::settings::Settings; | ||||
| pub use self::update_builder::UpdateBuilder; | ||||
| pub use self::update_step::UpdateIndexingStep; | ||||
|   | ||||
| @@ -412,6 +412,23 @@ mod tests { | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         let fields_ids = index.faceted_fields(&rtxn).unwrap(); | ||||
|         assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); | ||||
|         // Only count the field_id 0 and level 0 facet values. | ||||
|         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); | ||||
|         assert_eq!(count, 3); | ||||
|         drop(rtxn); | ||||
|  | ||||
|         // Index a little more documents with new and current facets values. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index); | ||||
|         builder.update_format(UpdateFormat::Csv); | ||||
|         builder.execute(content, |_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         // Only count the field_id 0 and level 0 facet values. | ||||
|         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); | ||||
|         assert_eq!(count, 4); | ||||
|         drop(rtxn); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use grenad::CompressionType; | ||||
| use rayon::ThreadPool; | ||||
|  | ||||
| use crate::Index; | ||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings}; | ||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; | ||||
|  | ||||
| pub struct UpdateBuilder<'a> { | ||||
|     pub(crate) log_every_n: Option<usize>, | ||||
| @@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> { | ||||
|  | ||||
|         builder | ||||
|     } | ||||
|  | ||||
|     pub fn facets<'t, 'u, 'i>( | ||||
|         self, | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|     ) -> Facets<'t, 'u, 'i> | ||||
|     { | ||||
|         let mut builder = Facets::new(wtxn, index); | ||||
|  | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|  | ||||
|         builder | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user