mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	format the whole project
This commit is contained in:
		
							
								
								
									
										5
									
								
								.rustfmt.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.rustfmt.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | |||||||
|  | unstable_features = true | ||||||
|  |  | ||||||
|  | use_small_heuristics = "max" | ||||||
|  | imports_granularity = "Module" | ||||||
|  | group_imports = "StdExternalCrate" | ||||||
							
								
								
									
										15
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								README.md
									
									
									
									
									
								
							| @@ -41,3 +41,18 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head | |||||||
| ### Querying the engine via the website | ### Querying the engine via the website | ||||||
|  |  | ||||||
| You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). | You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ## Contributing | ||||||
|  |  | ||||||
|  | You can set up a `git-hook` to stop you from making a commit too fast. It'll stop you if: | ||||||
|  | - Any of the workspaces does not build | ||||||
|  | - Your code is not well-formatted | ||||||
|  |  | ||||||
|  | These two things are also checked in the CI, so ignoring the hook won't help you merge your code. | ||||||
|  | But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook. | ||||||
|  |  | ||||||
|  | To enable the hook, run the following command from the root of the project: | ||||||
|  | ``` | ||||||
|  | cp script/pre-commit .git/hooks/pre-commit | ||||||
|  | ``` | ||||||
|   | |||||||
| @@ -6,33 +6,24 @@ use milli::update::Settings; | |||||||
| use utils::Conf; | use utils::Conf; | ||||||
|  |  | ||||||
| fn base_conf(builder: &mut Settings) { | fn base_conf(builder: &mut Settings) { | ||||||
|     let displayed_fields = [ |     let displayed_fields = | ||||||
|         "id", "title", "album", "artist", "genre", "country", "released", "duration", |         ["id", "title", "album", "artist", "genre", "country", "released", "duration"] | ||||||
|     ] |             .iter() | ||||||
|     .iter() |             .map(|s| s.to_string()) | ||||||
|     .map(|s| s.to_string()) |             .collect(); | ||||||
|     .collect(); |  | ||||||
|     builder.set_displayed_fields(displayed_fields); |     builder.set_displayed_fields(displayed_fields); | ||||||
|  |  | ||||||
|     let searchable_fields = ["title", "album", "artist"] |     let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); | ||||||
|  |     builder.set_searchable_fields(searchable_fields); | ||||||
|  |  | ||||||
|  |     let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] | ||||||
|         .iter() |         .iter() | ||||||
|         .map(|s| s.to_string()) |         .map(|s| s.to_string()) | ||||||
|         .collect(); |         .collect(); | ||||||
|     builder.set_searchable_fields(searchable_fields); |  | ||||||
|  |  | ||||||
|     let faceted_fields = [ |  | ||||||
|         "released-timestamp", |  | ||||||
|         "duration-float", |  | ||||||
|         "genre", |  | ||||||
|         "country", |  | ||||||
|         "artist", |  | ||||||
|     ] |  | ||||||
|     .iter() |  | ||||||
|     .map(|s| s.to_string()) |  | ||||||
|     .collect(); |  | ||||||
|     builder.set_filterable_fields(faceted_fields); |     builder.set_filterable_fields(faceted_fields); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[rustfmt::skip] | ||||||
| const BASE_CONF: Conf = Conf { | const BASE_CONF: Conf = Conf { | ||||||
|     dataset: datasets_paths::SMOL_SONGS, |     dataset: datasets_paths::SMOL_SONGS, | ||||||
|     queries: &[ |     queries: &[ | ||||||
| @@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf { | |||||||
| }; | }; | ||||||
|  |  | ||||||
| fn bench_songs(c: &mut criterion::Criterion) { | fn bench_songs(c: &mut criterion::Criterion) { | ||||||
|     let default_criterion: Vec<String> = milli::default_criteria() |     let default_criterion: Vec<String> = | ||||||
|         .iter() |         milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); | ||||||
|         .map(|criteria| criteria.to_string()) |  | ||||||
|         .collect(); |  | ||||||
|     let default_criterion = default_criterion.iter().map(|s| s.as_str()); |     let default_criterion = default_criterion.iter().map(|s| s.as_str()); | ||||||
|     let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") |     let asc_default: Vec<&str> = | ||||||
|         .chain(default_criterion.clone()) |         std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); | ||||||
|         .collect(); |     let desc_default: Vec<&str> = | ||||||
|     let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") |         std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); | ||||||
|         .chain(default_criterion.clone()) |  | ||||||
|         .collect(); |  | ||||||
|  |  | ||||||
|     let basic_with_quote: Vec<String> = BASE_CONF |     let basic_with_quote: Vec<String> = BASE_CONF | ||||||
|         .queries |         .queries | ||||||
|         .iter() |         .iter() | ||||||
|         .map(|s| { |         .map(|s| { | ||||||
|             s.trim() |             s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ") | ||||||
|                 .split(' ') |  | ||||||
|                 .map(|s| format!(r#""{}""#, s)) |  | ||||||
|                 .collect::<Vec<String>>() |  | ||||||
|                 .join(" ") |  | ||||||
|         }) |         }) | ||||||
|         .collect(); |         .collect(); | ||||||
|     let basic_with_quote: &[&str] = &basic_with_quote |     let basic_with_quote: &[&str] = | ||||||
|         .iter() |         &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>(); | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .collect::<Vec<&str>>(); |  | ||||||
|  |  | ||||||
|  |     #[rustfmt::skip] | ||||||
|     let confs = &[ |     let confs = &[ | ||||||
|         /* first we bench each criterion alone */ |         /* first we bench each criterion alone */ | ||||||
|         utils::Conf { |         utils::Conf { | ||||||
|   | |||||||
| @@ -3,10 +3,8 @@ use std::path::Path; | |||||||
|  |  | ||||||
| use criterion::BenchmarkId; | use criterion::BenchmarkId; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use milli::{ | use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}; | ||||||
|     update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, | use milli::{FilterCondition, Index}; | ||||||
|     FilterCondition, Index, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| pub struct Conf<'a> { | pub struct Conf<'a> { | ||||||
|     /// where we are going to create our database.mmdb directory |     /// where we are going to create our database.mmdb directory | ||||||
|   | |||||||
| @@ -6,16 +6,14 @@ use milli::update::Settings; | |||||||
| use utils::Conf; | use utils::Conf; | ||||||
|  |  | ||||||
| fn base_conf(builder: &mut Settings) { | fn base_conf(builder: &mut Settings) { | ||||||
|     let displayed_fields = ["title", "body", "url"] |     let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); | ||||||
|         .iter() |  | ||||||
|         .map(|s| s.to_string()) |  | ||||||
|         .collect(); |  | ||||||
|     builder.set_displayed_fields(displayed_fields); |     builder.set_displayed_fields(displayed_fields); | ||||||
|  |  | ||||||
|     let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); |     let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); | ||||||
|     builder.set_searchable_fields(searchable_fields); |     builder.set_searchable_fields(searchable_fields); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[rustfmt::skip] | ||||||
| const BASE_CONF: Conf = Conf { | const BASE_CONF: Conf = Conf { | ||||||
|     dataset: datasets_paths::SMOL_WIKI_ARTICLES, |     dataset: datasets_paths::SMOL_WIKI_ARTICLES, | ||||||
|     queries: &[ |     queries: &[ | ||||||
| @@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) { | |||||||
|         .queries |         .queries | ||||||
|         .iter() |         .iter() | ||||||
|         .map(|s| { |         .map(|s| { | ||||||
|             s.trim() |             s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ") | ||||||
|                 .split(' ') |  | ||||||
|                 .map(|s| format!(r#""{}""#, s)) |  | ||||||
|                 .collect::<Vec<String>>() |  | ||||||
|                 .join(" ") |  | ||||||
|         }) |         }) | ||||||
|         .collect(); |         .collect(); | ||||||
|     let basic_with_quote: &[&str] = &basic_with_quote |     let basic_with_quote: &[&str] = | ||||||
|         .iter() |         &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>(); | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .collect::<Vec<&str>>(); |  | ||||||
|  |  | ||||||
|  |     #[rustfmt::skip] | ||||||
|     let confs = &[ |     let confs = &[ | ||||||
|         /* first we bench each criterion alone */ |         /* first we bench each criterion alone */ | ||||||
|         utils::Conf { |         utils::Conf { | ||||||
|   | |||||||
| @@ -1,9 +1,7 @@ | |||||||
|  | use std::fs::File; | ||||||
|  | use std::io::{Cursor, Read, Seek, Write}; | ||||||
| use std::path::{Path, PathBuf}; | use std::path::{Path, PathBuf}; | ||||||
| use std::{env, fs}; | use std::{env, fs}; | ||||||
| use std::{ |  | ||||||
|     fs::File, |  | ||||||
|     io::{Cursor, Read, Seek, Write}, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| use bytes::Bytes; | use bytes::Bytes; | ||||||
| use convert_case::{Case, Casing}; | use convert_case::{Case, Casing}; | ||||||
| @@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> { | |||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         if out_file.exists() { |         if out_file.exists() { | ||||||
|             eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); |             eprintln!( | ||||||
|  |                 "The dataset {} already exists on the file system and will not be downloaded again", | ||||||
|  |                 dataset | ||||||
|  |             ); | ||||||
|             continue; |             continue; | ||||||
|         } |         } | ||||||
|         let url = format!("{}/{}.csv.gz", BASE_URL, dataset); |         let url = format!("{}/{}.csv.gz", BASE_URL, dataset); | ||||||
| @@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> { | |||||||
| } | } | ||||||
|  |  | ||||||
| fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> { | fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> { | ||||||
|     let bytes = reqwest::blocking::Client::builder() |     let bytes = | ||||||
|         .timeout(None) |         reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?; | ||||||
|         .build()? |  | ||||||
|         .get(url) |  | ||||||
|         .send()? |  | ||||||
|         .bytes()?; |  | ||||||
|     Ok(Cursor::new(bytes)) |     Ok(Cursor::new(bytes)) | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,9 +1,8 @@ | |||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
|  |  | ||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use heed::{Env, EnvOpenOptions, CompactionOption}; | use heed::{CompactionOption, Env, EnvOpenOptions}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  |  | ||||||
| use Command::*; | use Command::*; | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| @@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> { | |||||||
|             use CompactionOption::*; |             use CompactionOption::*; | ||||||
|             let compaction = if enable_compaction { Enabled } else { Disabled }; |             let compaction = if enable_compaction { Enabled } else { Disabled }; | ||||||
|             copy_main_database_to_stdout(env, compaction) |             copy_main_database_to_stdout(env, compaction) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,5 @@ | |||||||
| mod update_store; | mod update_store; | ||||||
|  |  | ||||||
| use std::{io, mem}; |  | ||||||
| use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | ||||||
| use std::fmt::Display; | use std::fmt::Display; | ||||||
| use std::fs::{create_dir_all, File}; | use std::fs::{create_dir_all, File}; | ||||||
| @@ -10,16 +9,19 @@ use std::path::PathBuf; | |||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  | use std::{io, mem}; | ||||||
|  |  | ||||||
| use askama_warp::Template; | use askama_warp::Template; | ||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use either::Either; | use either::Either; | ||||||
| use flate2::read::GzDecoder; | use flate2::read::GzDecoder; | ||||||
| use futures::{FutureExt, StreamExt}; | use futures::{stream, FutureExt, StreamExt}; | ||||||
| use futures::stream; |  | ||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; | use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; | ||||||
|  | use milli::update::UpdateIndexingStep::*; | ||||||
|  | use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; | ||||||
|  | use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult}; | ||||||
| use once_cell::sync::OnceCell; | use once_cell::sync::OnceCell; | ||||||
| use rayon::ThreadPool; | use rayon::ThreadPool; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| @@ -28,12 +30,9 @@ use structopt::StructOpt; | |||||||
| use tokio::fs::File as TFile; | use tokio::fs::File as TFile; | ||||||
| use tokio::io::AsyncWriteExt; | use tokio::io::AsyncWriteExt; | ||||||
| use tokio::sync::broadcast; | use tokio::sync::broadcast; | ||||||
| use warp::{Filter, http::Response}; |  | ||||||
| use warp::filters::ws::Message; | use warp::filters::ws::Message; | ||||||
|  | use warp::http::Response; | ||||||
| use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult}; | use warp::Filter; | ||||||
| use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; |  | ||||||
| use milli::update::UpdateIndexingStep::*; |  | ||||||
|  |  | ||||||
| use self::update_store::UpdateStore; | use self::update_store::UpdateStore; | ||||||
|  |  | ||||||
| @@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | |||||||
|                 for (word, token) in analyzed.reconstruct() { |                 for (word, token) in analyzed.reconstruct() { | ||||||
|                     if token.is_word() { |                     if token.is_word() { | ||||||
|                         let to_highlight = matching_words.matching_bytes(token.text()).is_some(); |                         let to_highlight = matching_words.matching_bytes(token.text()).is_some(); | ||||||
|                         if to_highlight { string.push_str("<mark>") } |                         if to_highlight { | ||||||
|  |                             string.push_str("<mark>") | ||||||
|  |                         } | ||||||
|                         string.push_str(word); |                         string.push_str(word); | ||||||
|                         if to_highlight { string.push_str("</mark>") } |                         if to_highlight { | ||||||
|  |                             string.push_str("</mark>") | ||||||
|  |                         } | ||||||
|                     } else { |                     } else { | ||||||
|                         string.push_str(word); |                         string.push_str(word); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Value::String(string) |                 Value::String(string) | ||||||
|             } |             } | ||||||
|             Value::Array(values) => { |             Value::Array(values) => Value::Array( | ||||||
|                 Value::Array(values.into_iter() |                 values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(), | ||||||
|                     .map(|v| self.highlight_value(v, matching_words)) |             ), | ||||||
|                     .collect()) |             Value::Object(object) => Value::Object( | ||||||
|             } |                 object | ||||||
|             Value::Object(object) => { |                     .into_iter() | ||||||
|                 Value::Object(object.into_iter() |  | ||||||
|                     .map(|(k, v)| (k, self.highlight_value(v, matching_words))) |                     .map(|(k, v)| (k, self.highlight_value(v, matching_words))) | ||||||
|                     .collect()) |                     .collect(), | ||||||
|             } |             ), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -236,12 +238,7 @@ enum UpdateMeta { | |||||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | #[derive(Debug, Clone, Serialize, Deserialize)] | ||||||
| #[serde(tag = "type")] | #[serde(tag = "type")] | ||||||
| enum UpdateMetaProgress { | enum UpdateMetaProgress { | ||||||
|     DocumentsAddition { |     DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> }, | ||||||
|         step: usize, |  | ||||||
|         total_steps: usize, |  | ||||||
|         current: usize, |  | ||||||
|         total: Option<usize>, |  | ||||||
|     }, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] | #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] | ||||||
| @@ -342,157 +339,185 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); |             update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); | ||||||
|             update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); |             update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); | ||||||
|             update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); |             update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); | ||||||
|             update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); |             update_builder | ||||||
|  |                 .chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); | ||||||
|  |  | ||||||
|             let before_update = Instant::now(); |             let before_update = Instant::now(); | ||||||
|             // we extract the update type and execute the update itself. |             // we extract the update type and execute the update itself. | ||||||
|             let result: anyhow::Result<()> = match meta { |             let result: anyhow::Result<()> = | ||||||
|                 UpdateMeta::DocumentsAddition { method, format, encoding } => { |                 match meta { | ||||||
|                     // We must use the write transaction of the update here. |                     UpdateMeta::DocumentsAddition { method, format, encoding } => { | ||||||
|                     let mut wtxn = index_cloned.write_txn()?; |                         // We must use the write transaction of the update here. | ||||||
|                     let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); |                         let mut wtxn = index_cloned.write_txn()?; | ||||||
|  |                         let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); | ||||||
|  |  | ||||||
|                     match format.as_str() { |                         match format.as_str() { | ||||||
|                         "csv" => builder.update_format(UpdateFormat::Csv), |                             "csv" => builder.update_format(UpdateFormat::Csv), | ||||||
|                         "json" => builder.update_format(UpdateFormat::Json), |                             "json" => builder.update_format(UpdateFormat::Json), | ||||||
|                         "json-stream" => builder.update_format(UpdateFormat::JsonStream), |                             "json-stream" => builder.update_format(UpdateFormat::JsonStream), | ||||||
|                         otherwise => panic!("invalid update format {:?}", otherwise), |                             otherwise => panic!("invalid update format {:?}", otherwise), | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     match method.as_str() { |  | ||||||
|                         "replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments), |  | ||||||
|                         "update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments), |  | ||||||
|                         otherwise => panic!("invalid indexing method {:?}", otherwise), |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     let reader = match encoding.as_deref() { |  | ||||||
|                         Some("gzip") => Box::new(GzDecoder::new(content)), |  | ||||||
|                         None => Box::new(content) as Box<dyn io::Read>, |  | ||||||
|                         otherwise => panic!("invalid encoding format {:?}", otherwise), |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     let result = builder.execute(reader, |indexing_step, update_id| { |  | ||||||
|                         let (current, total) = match indexing_step { |  | ||||||
|                             TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), |  | ||||||
|                             ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), |  | ||||||
|                             IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), |  | ||||||
|                             MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), |  | ||||||
|                         }; |                         }; | ||||||
|                         let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { |  | ||||||
|                             update_id, |  | ||||||
|                             meta: UpdateMetaProgress::DocumentsAddition { |  | ||||||
|                                 step: indexing_step.step(), |  | ||||||
|                                 total_steps: indexing_step.number_of_steps(), |  | ||||||
|                                 current, |  | ||||||
|                                 total, |  | ||||||
|                             }, |  | ||||||
|                         }); |  | ||||||
|                     }); |  | ||||||
|  |  | ||||||
|                     match result { |                         match method.as_str() { | ||||||
|                         Ok(_) => wtxn.commit().map_err(Into::into), |                             "replace" => builder | ||||||
|                         Err(e) => Err(e.into()), |                                 .index_documents_method(IndexDocumentsMethod::ReplaceDocuments), | ||||||
|                     } |                             "update" => builder | ||||||
|                 } |                                 .index_documents_method(IndexDocumentsMethod::UpdateDocuments), | ||||||
|                 UpdateMeta::ClearDocuments => { |                             otherwise => panic!("invalid indexing method {:?}", otherwise), | ||||||
|                     // We must use the write transaction of the update here. |  | ||||||
|                     let mut wtxn = index_cloned.write_txn()?; |  | ||||||
|                     let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); |  | ||||||
|  |  | ||||||
|                     match builder.execute() { |  | ||||||
|                         Ok(_count) => wtxn.commit().map_err(Into::into), |  | ||||||
|                         Err(e) => Err(e.into()), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 UpdateMeta::Settings(settings) => { |  | ||||||
|                     // We must use the write transaction of the update here. |  | ||||||
|                     let mut wtxn = index_cloned.write_txn()?; |  | ||||||
|                     let mut builder = update_builder.settings(&mut wtxn, &index_cloned); |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.searchable_attributes { |  | ||||||
|                         Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes), |  | ||||||
|                         Setting::Reset => builder.reset_searchable_fields(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.displayed_attributes { |  | ||||||
|                         Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes), |  | ||||||
|                         Setting::Reset => builder.reset_displayed_fields(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.filterable_attributes { |  | ||||||
|                         Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes), |  | ||||||
|                         Setting::Reset => builder.reset_filterable_fields(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.criteria { |  | ||||||
|                         Setting::Set(criteria) => builder.set_criteria(criteria), |  | ||||||
|                         Setting::Reset => builder.reset_criteria(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.stop_words { |  | ||||||
|                         Setting::Set(stop_words) => builder.set_stop_words(stop_words), |  | ||||||
|                         Setting::Reset => builder.reset_stop_words(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // We transpose the settings JSON struct into a real setting update. |  | ||||||
|                     match settings.synonyms { |  | ||||||
|                         Setting::Set(synonyms) => builder.set_synonyms(synonyms), |  | ||||||
|                         Setting::Reset => builder.reset_synonyms(), |  | ||||||
|                         Setting::NotSet => () |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     let result = builder.execute(|indexing_step, update_id| { |  | ||||||
|                         let (current, total) = match indexing_step { |  | ||||||
|                             TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), |  | ||||||
|                             ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), |  | ||||||
|                             IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), |  | ||||||
|                             MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), |  | ||||||
|                         }; |                         }; | ||||||
|                         let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { |  | ||||||
|                             update_id, |  | ||||||
|                             meta: UpdateMetaProgress::DocumentsAddition { |  | ||||||
|                                 step: indexing_step.step(), |  | ||||||
|                                 total_steps: indexing_step.number_of_steps(), |  | ||||||
|                                 current, |  | ||||||
|                                 total, |  | ||||||
|                             }, |  | ||||||
|                         }); |  | ||||||
|                     }); |  | ||||||
|  |  | ||||||
|                     match result { |                         let reader = match encoding.as_deref() { | ||||||
|                         Ok(_count) => wtxn.commit().map_err(Into::into), |                             Some("gzip") => Box::new(GzDecoder::new(content)), | ||||||
|                         Err(e) => Err(e.into()), |                             None => Box::new(content) as Box<dyn io::Read>, | ||||||
|  |                             otherwise => panic!("invalid encoding format {:?}", otherwise), | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|  |                         let result = builder.execute(reader, |indexing_step, update_id| { | ||||||
|  |                             let (current, total) = match indexing_step { | ||||||
|  |                                 TransformFromUserIntoGenericFormat { documents_seen } => { | ||||||
|  |                                     (documents_seen, None) | ||||||
|  |                                 } | ||||||
|  |                                 ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { | ||||||
|  |                                     (documents_seen, Some(total_documents)) | ||||||
|  |                                 } | ||||||
|  |                                 IndexDocuments { documents_seen, total_documents } => { | ||||||
|  |                                     (documents_seen, Some(total_documents)) | ||||||
|  |                                 } | ||||||
|  |                                 MergeDataIntoFinalDatabase { databases_seen, total_databases } => { | ||||||
|  |                                     (databases_seen, Some(total_databases)) | ||||||
|  |                                 } | ||||||
|  |                             }; | ||||||
|  |                             let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { | ||||||
|  |                                 update_id, | ||||||
|  |                                 meta: UpdateMetaProgress::DocumentsAddition { | ||||||
|  |                                     step: indexing_step.step(), | ||||||
|  |                                     total_steps: indexing_step.number_of_steps(), | ||||||
|  |                                     current, | ||||||
|  |                                     total, | ||||||
|  |                                 }, | ||||||
|  |                             }); | ||||||
|  |                         }); | ||||||
|  |  | ||||||
|  |                         match result { | ||||||
|  |                             Ok(_) => wtxn.commit().map_err(Into::into), | ||||||
|  |                             Err(e) => Err(e.into()), | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                     UpdateMeta::ClearDocuments => { | ||||||
|                 UpdateMeta::Facets(levels) => { |                         // We must use the write transaction of the update here. | ||||||
|                     // We must use the write transaction of the update here. |                         let mut wtxn = index_cloned.write_txn()?; | ||||||
|                     let mut wtxn = index_cloned.write_txn()?; |                         let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); | ||||||
|                     let mut builder = update_builder.facets(&mut wtxn, &index_cloned); |  | ||||||
|                     if let Some(value) = levels.level_group_size { |                         match builder.execute() { | ||||||
|                         builder.level_group_size(value); |                             Ok(_count) => wtxn.commit().map_err(Into::into), | ||||||
|  |                             Err(e) => Err(e.into()), | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                     if let Some(value) = levels.min_level_size { |                     UpdateMeta::Settings(settings) => { | ||||||
|                         builder.min_level_size(value); |                         // We must use the write transaction of the update here. | ||||||
|  |                         let mut wtxn = index_cloned.write_txn()?; | ||||||
|  |                         let mut builder = update_builder.settings(&mut wtxn, &index_cloned); | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.searchable_attributes { | ||||||
|  |                             Setting::Set(searchable_attributes) => { | ||||||
|  |                                 builder.set_searchable_fields(searchable_attributes) | ||||||
|  |                             } | ||||||
|  |                             Setting::Reset => builder.reset_searchable_fields(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.displayed_attributes { | ||||||
|  |                             Setting::Set(displayed_attributes) => { | ||||||
|  |                                 builder.set_displayed_fields(displayed_attributes) | ||||||
|  |                             } | ||||||
|  |                             Setting::Reset => builder.reset_displayed_fields(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.filterable_attributes { | ||||||
|  |                             Setting::Set(filterable_attributes) => { | ||||||
|  |                                 builder.set_filterable_fields(filterable_attributes) | ||||||
|  |                             } | ||||||
|  |                             Setting::Reset => builder.reset_filterable_fields(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.criteria { | ||||||
|  |                             Setting::Set(criteria) => builder.set_criteria(criteria), | ||||||
|  |                             Setting::Reset => builder.reset_criteria(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.stop_words { | ||||||
|  |                             Setting::Set(stop_words) => builder.set_stop_words(stop_words), | ||||||
|  |                             Setting::Reset => builder.reset_stop_words(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // We transpose the settings JSON struct into a real setting update. | ||||||
|  |                         match settings.synonyms { | ||||||
|  |                             Setting::Set(synonyms) => builder.set_synonyms(synonyms), | ||||||
|  |                             Setting::Reset => builder.reset_synonyms(), | ||||||
|  |                             Setting::NotSet => (), | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         let result = builder.execute(|indexing_step, update_id| { | ||||||
|  |                             let (current, total) = match indexing_step { | ||||||
|  |                                 TransformFromUserIntoGenericFormat { documents_seen } => { | ||||||
|  |                                     (documents_seen, None) | ||||||
|  |                                 } | ||||||
|  |                                 ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { | ||||||
|  |                                     (documents_seen, Some(total_documents)) | ||||||
|  |                                 } | ||||||
|  |                                 IndexDocuments { documents_seen, total_documents } => { | ||||||
|  |                                     (documents_seen, Some(total_documents)) | ||||||
|  |                                 } | ||||||
|  |                                 MergeDataIntoFinalDatabase { databases_seen, total_databases } => { | ||||||
|  |                                     (databases_seen, Some(total_databases)) | ||||||
|  |                                 } | ||||||
|  |                             }; | ||||||
|  |                             let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { | ||||||
|  |                                 update_id, | ||||||
|  |                                 meta: UpdateMetaProgress::DocumentsAddition { | ||||||
|  |                                     step: indexing_step.step(), | ||||||
|  |                                     total_steps: indexing_step.number_of_steps(), | ||||||
|  |                                     current, | ||||||
|  |                                     total, | ||||||
|  |                                 }, | ||||||
|  |                             }); | ||||||
|  |                         }); | ||||||
|  |  | ||||||
|  |                         match result { | ||||||
|  |                             Ok(_count) => wtxn.commit().map_err(Into::into), | ||||||
|  |                             Err(e) => Err(e.into()), | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                     match builder.execute() { |                     UpdateMeta::Facets(levels) => { | ||||||
|                         Ok(()) => wtxn.commit().map_err(Into::into), |                         // We must use the write transaction of the update here. | ||||||
|                         Err(e) => Err(e.into()), |                         let mut wtxn = index_cloned.write_txn()?; | ||||||
|  |                         let mut builder = update_builder.facets(&mut wtxn, &index_cloned); | ||||||
|  |                         if let Some(value) = levels.level_group_size { | ||||||
|  |                             builder.level_group_size(value); | ||||||
|  |                         } | ||||||
|  |                         if let Some(value) = levels.min_level_size { | ||||||
|  |                             builder.min_level_size(value); | ||||||
|  |                         } | ||||||
|  |                         match builder.execute() { | ||||||
|  |                             Ok(()) => wtxn.commit().map_err(Into::into), | ||||||
|  |                             Err(e) => Err(e.into()), | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 }; | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             let meta = match result { |             let meta = match result { | ||||||
|                 Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()), |                 Ok(()) => { | ||||||
|  |                     format!("valid update content processed in {:.02?}", before_update.elapsed()) | ||||||
|  |                 } | ||||||
|                 Err(e) => format!("error while processing update content: {:?}", e), |                 Err(e) => format!("error while processing update content: {:?}", e), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
| @@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             let _ = update_status_sender_cloned.send(processed); |             let _ = update_status_sender_cloned.send(processed); | ||||||
|  |  | ||||||
|             Ok(meta) |             Ok(meta) | ||||||
|         })?; |         }, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|     // The database name will not change. |     // The database name will not change. | ||||||
|     let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); |     let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); | ||||||
| @@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> { | |||||||
|     let db_name_cloned = db_name.clone(); |     let db_name_cloned = db_name.clone(); | ||||||
|     let lmdb_path_cloned = lmdb_path.clone(); |     let lmdb_path_cloned = lmdb_path.clone(); | ||||||
|     let index_cloned = index.clone(); |     let index_cloned = index.clone(); | ||||||
|     let dash_html_route = warp::filters::method::get() |     let dash_html_route = | ||||||
|         .and(warp::filters::path::end()) |         warp::filters::method::get().and(warp::filters::path::end()).map(move || { | ||||||
|         .map(move || { |  | ||||||
|             // We retrieve the database size. |             // We retrieve the database size. | ||||||
|             let db_size = File::open(lmdb_path_cloned.clone()) |             let db_size = | ||||||
|                 .unwrap() |                 File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize; | ||||||
|                 .metadata() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .len() as usize; |  | ||||||
|  |  | ||||||
|             // And the number of documents in the database. |             // And the number of documents in the database. | ||||||
|             let rtxn = index_cloned.read_txn().unwrap(); |             let rtxn = index_cloned.read_txn().unwrap(); | ||||||
| @@ -537,111 +559,105 @@ async fn main() -> anyhow::Result<()> { | |||||||
|         .and(warp::path!("updates")) |         .and(warp::path!("updates")) | ||||||
|         .map(move |header: String| { |         .map(move |header: String| { | ||||||
|             let update_store = update_store_cloned.clone(); |             let update_store = update_store_cloned.clone(); | ||||||
|             let mut updates = update_store.iter_metas(|processed, aborted, pending| { |             let mut updates = update_store | ||||||
|                 let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new(); |                 .iter_metas(|processed, aborted, pending| { | ||||||
|                 for result in processed { |                     let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new(); | ||||||
|                     let (uid, meta) = result?; |                     for result in processed { | ||||||
|                     updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); |                         let (uid, meta) = result?; | ||||||
|                 } |                         updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); | ||||||
|                 for result in aborted { |                     } | ||||||
|                     let (uid, meta) = result?; |                     for result in aborted { | ||||||
|                     updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); |                         let (uid, meta) = result?; | ||||||
|                 } |                         updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); | ||||||
|                 for result in pending { |                     } | ||||||
|                     let (uid, meta) = result?; |                     for result in pending { | ||||||
|                     updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); |                         let (uid, meta) = result?; | ||||||
|                 } |                         updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); | ||||||
|                 Ok(updates) |                     } | ||||||
|             }).unwrap(); |                     Ok(updates) | ||||||
|  |                 }) | ||||||
|  |                 .unwrap(); | ||||||
|  |  | ||||||
|             updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse()); |             updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse()); | ||||||
|  |  | ||||||
|             if header.contains("text/html") { |             if header.contains("text/html") { | ||||||
|                 // We retrieve the database size. |                 // We retrieve the database size. | ||||||
|                 let db_size = File::open(lmdb_path_cloned.clone()) |                 let db_size = | ||||||
|                     .unwrap() |                     File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() | ||||||
|                     .metadata() |                         as usize; | ||||||
|                     .unwrap() |  | ||||||
|                     .len() as usize; |  | ||||||
|  |  | ||||||
|                 // And the number of documents in the database. |                 // And the number of documents in the database. | ||||||
|                 let rtxn = index_cloned.read_txn().unwrap(); |                 let rtxn = index_cloned.read_txn().unwrap(); | ||||||
|                 let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize; |                 let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize; | ||||||
|  |  | ||||||
|                 let template = UpdatesTemplate { |                 let template = | ||||||
|                     db_name: db_name.clone(), |                     UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates }; | ||||||
|                     db_size, |  | ||||||
|                     docs_count, |  | ||||||
|                     updates, |  | ||||||
|                 }; |  | ||||||
|                 Box::new(template) as Box<dyn warp::Reply> |                 Box::new(template) as Box<dyn warp::Reply> | ||||||
|             } else { |             } else { | ||||||
|                 Box::new(warp::reply::json(&updates)) |                 Box::new(warp::reply::json(&updates)) | ||||||
|             } |             } | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|     let dash_bulma_route = warp::filters::method::get() |     let dash_bulma_route = | ||||||
|         .and(warp::path!("bulma.min.css")) |         warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "text/css; charset=utf-8") |                 .header("content-type", "text/css; charset=utf-8") | ||||||
|             .body(include_str!("../public/bulma.min.css")) |                 .body(include_str!("../public/bulma.min.css")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_bulma_dark_route = warp::filters::method::get() |     let dash_bulma_dark_route = | ||||||
|         .and(warp::path!("bulma-prefers-dark.min.css")) |         warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "text/css; charset=utf-8") |                 .header("content-type", "text/css; charset=utf-8") | ||||||
|             .body(include_str!("../public/bulma-prefers-dark.min.css")) |                 .body(include_str!("../public/bulma-prefers-dark.min.css")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_style_route = warp::filters::method::get() |     let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| { | ||||||
|         .and(warp::path!("style.css")) |         Response::builder() | ||||||
|         .map(|| Response::builder() |  | ||||||
|             .header("content-type", "text/css; charset=utf-8") |             .header("content-type", "text/css; charset=utf-8") | ||||||
|             .body(include_str!("../public/style.css")) |             .body(include_str!("../public/style.css")) | ||||||
|         ); |     }); | ||||||
|  |  | ||||||
|     let dash_jquery_route = warp::filters::method::get() |     let dash_jquery_route = | ||||||
|         .and(warp::path!("jquery-3.4.1.min.js")) |         warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "application/javascript; charset=utf-8") |                 .header("content-type", "application/javascript; charset=utf-8") | ||||||
|             .body(include_str!("../public/jquery-3.4.1.min.js")) |                 .body(include_str!("../public/jquery-3.4.1.min.js")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_filesize_route = warp::filters::method::get() |     let dash_filesize_route = | ||||||
|         .and(warp::path!("filesize.min.js")) |         warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "application/javascript; charset=utf-8") |                 .header("content-type", "application/javascript; charset=utf-8") | ||||||
|             .body(include_str!("../public/filesize.min.js")) |                 .body(include_str!("../public/filesize.min.js")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_script_route = warp::filters::method::get() |     let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| { | ||||||
|         .and(warp::path!("script.js")) |         Response::builder() | ||||||
|         .map(|| Response::builder() |  | ||||||
|             .header("content-type", "application/javascript; charset=utf-8") |             .header("content-type", "application/javascript; charset=utf-8") | ||||||
|             .body(include_str!("../public/script.js")) |             .body(include_str!("../public/script.js")) | ||||||
|         ); |     }); | ||||||
|  |  | ||||||
|     let updates_script_route = warp::filters::method::get() |     let updates_script_route = | ||||||
|         .and(warp::path!("updates-script.js")) |         warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "application/javascript; charset=utf-8") |                 .header("content-type", "application/javascript; charset=utf-8") | ||||||
|             .body(include_str!("../public/updates-script.js")) |                 .body(include_str!("../public/updates-script.js")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_logo_white_route = warp::filters::method::get() |     let dash_logo_white_route = | ||||||
|         .and(warp::path!("logo-white.svg")) |         warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "image/svg+xml") |                 .header("content-type", "image/svg+xml") | ||||||
|             .body(include_str!("../public/logo-white.svg")) |                 .body(include_str!("../public/logo-white.svg")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     let dash_logo_black_route = warp::filters::method::get() |     let dash_logo_black_route = | ||||||
|         .and(warp::path!("logo-black.svg")) |         warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| { | ||||||
|         .map(|| Response::builder() |             Response::builder() | ||||||
|             .header("content-type", "image/svg+xml") |                 .header("content-type", "image/svg+xml") | ||||||
|             .body(include_str!("../public/logo-black.svg")) |                 .body(include_str!("../public/logo-black.svg")) | ||||||
|         ); |         }); | ||||||
|  |  | ||||||
|     #[derive(Debug, Deserialize)] |     #[derive(Debug, Deserialize)] | ||||||
|     #[serde(untagged)] |     #[serde(untagged)] | ||||||
| @@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> { | |||||||
|                 search.filter(condition); |                 search.filter(condition); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); |             let SearchResult { matching_words, candidates, documents_ids } = | ||||||
|  |                 search.execute().unwrap(); | ||||||
|  |  | ||||||
|             let number_of_candidates = candidates.len(); |             let number_of_candidates = candidates.len(); | ||||||
|             let facets = if query.facet_distribution == Some(true) { |             let facets = if query.facet_distribution == Some(true) { | ||||||
| @@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { |             for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { | ||||||
|                 let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); |                 let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); | ||||||
|                 if !disable_highlighting { |                 if !disable_highlighting { | ||||||
|                     highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); |                     highlighter.highlight_record( | ||||||
|  |                         &mut object, | ||||||
|  |                         &matching_words, | ||||||
|  |                         &attributes_to_highlight, | ||||||
|  |                     ); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 documents.push(object); |                 documents.push(object); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let answer = Answer { |             let answer = | ||||||
|                 documents, |                 Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() }; | ||||||
|                 number_of_candidates, |  | ||||||
|                 facets: facets.unwrap_or_default(), |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             Response::builder() |             Response::builder() | ||||||
|                 .header("Content-Type", "application/json") |                 .header("Content-Type", "application/json") | ||||||
| @@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> { | |||||||
|         }); |         }); | ||||||
|  |  | ||||||
|     let index_cloned = index.clone(); |     let index_cloned = index.clone(); | ||||||
|     let document_route = warp::filters::method::get() |     let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map( | ||||||
|         .and(warp::path!("document" / String)) |         move |id: String| { | ||||||
|         .map(move |id: String| { |  | ||||||
|             let index = index_cloned.clone(); |             let index = index_cloned.clone(); | ||||||
|             let rtxn = index.read_txn().unwrap(); |             let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
| @@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             match external_documents_ids.get(&id) { |             match external_documents_ids.get(&id) { | ||||||
|                 Some(document_id) => { |                 Some(document_id) => { | ||||||
|                     let document_id = document_id as u32; |                     let document_id = document_id as u32; | ||||||
|                     let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); |                     let (_, obkv) = | ||||||
|  |                         index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); | ||||||
|                     let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); |                     let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); | ||||||
|  |  | ||||||
|                     Response::builder() |                     Response::builder() | ||||||
|                         .header("Content-Type", "application/json") |                         .header("Content-Type", "application/json") | ||||||
|                         .body(serde_json::to_string(&document).unwrap()) |                         .body(serde_json::to_string(&document).unwrap()) | ||||||
|                 } |                 } | ||||||
|                 None => { |                 None => Response::builder() | ||||||
|                     Response::builder() |                     .status(404) | ||||||
|                         .status(404) |                     .body(format!("Document with id {:?} not found.", id)), | ||||||
|                         .body(format!("Document with id {:?} not found.", id)) |  | ||||||
|                 } |  | ||||||
|             } |             } | ||||||
|         }); |         }, | ||||||
|  |     ); | ||||||
|  |  | ||||||
|     async fn buf_stream( |     async fn buf_stream( | ||||||
|         update_store: Arc<UpdateStore<UpdateMeta, String>>, |         update_store: Arc<UpdateStore<UpdateMeta, String>>, | ||||||
|         update_status_sender: broadcast::Sender<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>, |         update_status_sender: broadcast::Sender< | ||||||
|  |             UpdateStatus<UpdateMeta, UpdateMetaProgress, String>, | ||||||
|  |         >, | ||||||
|         update_method: Option<String>, |         update_method: Option<String>, | ||||||
|         update_format: UpdateFormat, |         update_format: UpdateFormat, | ||||||
|         encoding: Option<String>, |         encoding: Option<String>, | ||||||
|         mut stream: impl futures::Stream<Item=Result<impl bytes::Buf, warp::Error>> + Unpin, |         mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin, | ||||||
|     ) -> Result<impl warp::Reply, warp::Rejection> |     ) -> Result<impl warp::Reply, warp::Rejection> { | ||||||
|     { |  | ||||||
|         let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); |         let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); | ||||||
|         let mut file = TFile::from_std(file); |         let mut file = TFile::from_std(file); | ||||||
|  |  | ||||||
| @@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> { | |||||||
|  |  | ||||||
|     let update_store_cloned = update_store.clone(); |     let update_store_cloned = update_store.clone(); | ||||||
|     let update_status_sender_cloned = update_status_sender.clone(); |     let update_status_sender_cloned = update_status_sender.clone(); | ||||||
|     let clearing_route = warp::filters::method::post() |     let clearing_route = | ||||||
|         .and(warp::path!("clear-documents")) |         warp::filters::method::post().and(warp::path!("clear-documents")).map(move || { | ||||||
|         .map(move || { |  | ||||||
|             let meta = UpdateMeta::ClearDocuments; |             let meta = UpdateMeta::ClearDocuments; | ||||||
|             let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); |             let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); | ||||||
|             let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); |             let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); | ||||||
| @@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> { | |||||||
|  |  | ||||||
|     let update_store_cloned = update_store.clone(); |     let update_store_cloned = update_store.clone(); | ||||||
|     let update_status_sender_cloned = update_status_sender.clone(); |     let update_status_sender_cloned = update_status_sender.clone(); | ||||||
|     let abort_pending_updates_route = warp::filters::method::delete() |     let abort_pending_updates_route = | ||||||
|         .and(warp::path!("updates")) |         warp::filters::method::delete().and(warp::path!("updates")).map(move || { | ||||||
|         .map(move || { |  | ||||||
|             let updates = update_store_cloned.abort_pendings().unwrap(); |             let updates = update_store_cloned.abort_pendings().unwrap(); | ||||||
|             for (update_id, meta) in updates { |             for (update_id, meta) in updates { | ||||||
|                 let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); |                 let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); | ||||||
| @@ -930,25 +946,22 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             warp::reply() |             warp::reply() | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|     let update_ws_route = warp::ws() |     let update_ws_route = | ||||||
|         .and(warp::path!("updates" / "ws")) |         warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| { | ||||||
|         .map(move |ws: warp::ws::Ws| { |  | ||||||
|             // And then our closure will be called when it completes... |             // And then our closure will be called when it completes... | ||||||
|             let update_status_receiver = update_status_sender.subscribe(); |             let update_status_receiver = update_status_sender.subscribe(); | ||||||
|             ws.on_upgrade(|websocket| { |             ws.on_upgrade(|websocket| { | ||||||
|                 // Just echo all updates messages... |                 // Just echo all updates messages... | ||||||
|                 update_status_receiver |                 update_status_receiver | ||||||
|                     .into_stream() |                     .into_stream() | ||||||
|                     .flat_map(|result| { |                     .flat_map(|result| match result { | ||||||
|                         match result { |                         Ok(status) => { | ||||||
|                             Ok(status) => { |                             let msg = serde_json::to_string(&status).unwrap(); | ||||||
|                                 let msg = serde_json::to_string(&status).unwrap(); |                             stream::iter(Some(Ok(Message::text(msg)))) | ||||||
|                                 stream::iter(Some(Ok(Message::text(msg)))) |                         } | ||||||
|                             } |                         Err(e) => { | ||||||
|                             Err(e) => { |                             eprintln!("channel error: {:?}", e); | ||||||
|                                 eprintln!("channel error: {:?}", e); |                             stream::iter(None) | ||||||
|                                 stream::iter(None) |  | ||||||
|                             } |  | ||||||
|                         } |                         } | ||||||
|                     }) |                     }) | ||||||
|                     .forward(websocket) |                     .forward(websocket) | ||||||
| @@ -988,10 +1001,9 @@ async fn main() -> anyhow::Result<()> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use maplit::{btreeset,hashmap, hashset}; |     use maplit::{btreeset, hashmap, hashset}; | ||||||
|     use serde_test::{assert_tokens, Token}; |  | ||||||
|  |  | ||||||
|     use milli::update::Setting; |     use milli::update::Setting; | ||||||
|  |     use serde_test::{assert_tokens, Token}; | ||||||
|  |  | ||||||
|     use crate::Settings; |     use crate::Settings; | ||||||
|  |  | ||||||
| @@ -1000,50 +1012,53 @@ mod tests { | |||||||
|         let settings = Settings { |         let settings = Settings { | ||||||
|             displayed_attributes: Setting::Set(vec!["name".to_string()]), |             displayed_attributes: Setting::Set(vec!["name".to_string()]), | ||||||
|             searchable_attributes: Setting::Set(vec!["age".to_string()]), |             searchable_attributes: Setting::Set(vec!["age".to_string()]), | ||||||
|             filterable_attributes: Setting::Set(hashset!{ "age".to_string() }), |             filterable_attributes: Setting::Set(hashset! { "age".to_string() }), | ||||||
|             criteria: Setting::Set(vec!["asc(age)".to_string()]), |             criteria: Setting::Set(vec!["asc(age)".to_string()]), | ||||||
|             stop_words: Setting::Set(btreeset! { "and".to_string() }), |             stop_words: Setting::Set(btreeset! { "and".to_string() }), | ||||||
|             synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) |             synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         assert_tokens(&settings, &[ |         assert_tokens( | ||||||
|             Token::Struct { name: "Settings", len: 6 }, |             &settings, | ||||||
|             Token::Str("displayedAttributes"), |             &[ | ||||||
|             Token::Some, |                 Token::Struct { name: "Settings", len: 6 }, | ||||||
|             Token::Seq { len: Some(1) }, |                 Token::Str("displayedAttributes"), | ||||||
|             Token::Str("name"), |                 Token::Some, | ||||||
|             Token::SeqEnd, |                 Token::Seq { len: Some(1) }, | ||||||
|             Token::Str("searchableAttributes"), |                 Token::Str("name"), | ||||||
|             Token::Some, |                 Token::SeqEnd, | ||||||
|             Token::Seq { len: Some(1) }, |                 Token::Str("searchableAttributes"), | ||||||
|             Token::Str("age"), |                 Token::Some, | ||||||
|             Token::SeqEnd, |                 Token::Seq { len: Some(1) }, | ||||||
|             Token::Str("facetedAttributes"), |                 Token::Str("age"), | ||||||
|             Token::Some, |                 Token::SeqEnd, | ||||||
|             Token::Map { len: Some(1) }, |                 Token::Str("facetedAttributes"), | ||||||
|             Token::Str("age"), |                 Token::Some, | ||||||
|             Token::Str("integer"), |                 Token::Map { len: Some(1) }, | ||||||
|             Token::MapEnd, |                 Token::Str("age"), | ||||||
|             Token::Str("criteria"), |                 Token::Str("integer"), | ||||||
|             Token::Some, |                 Token::MapEnd, | ||||||
|             Token::Seq { len: Some(1) }, |                 Token::Str("criteria"), | ||||||
|             Token::Str("asc(age)"), |                 Token::Some, | ||||||
|             Token::SeqEnd, |                 Token::Seq { len: Some(1) }, | ||||||
|             Token::Str("stopWords"), |                 Token::Str("asc(age)"), | ||||||
|             Token::Some, |                 Token::SeqEnd, | ||||||
|             Token::Seq { len: Some(1) }, |                 Token::Str("stopWords"), | ||||||
|             Token::Str("and"), |                 Token::Some, | ||||||
|             Token::SeqEnd, |                 Token::Seq { len: Some(1) }, | ||||||
|             Token::Str("synonyms"), |                 Token::Str("and"), | ||||||
|             Token::Some, |                 Token::SeqEnd, | ||||||
|             Token::Map { len: Some(1) }, |                 Token::Str("synonyms"), | ||||||
|             Token::Str("alex"), |                 Token::Some, | ||||||
|             Token::Seq {len: Some(1) }, |                 Token::Map { len: Some(1) }, | ||||||
|             Token::Str("alexey"), |                 Token::Str("alex"), | ||||||
|             Token::SeqEnd, |                 Token::Seq { len: Some(1) }, | ||||||
|             Token::MapEnd, |                 Token::Str("alexey"), | ||||||
|             Token::StructEnd, |                 Token::SeqEnd, | ||||||
|         ]); |                 Token::MapEnd, | ||||||
|  |                 Token::StructEnd, | ||||||
|  |             ], | ||||||
|  |         ); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
| @@ -1057,22 +1072,25 @@ mod tests { | |||||||
|             synonyms: Setting::Reset, |             synonyms: Setting::Reset, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         assert_tokens(&settings, &[ |         assert_tokens( | ||||||
|             Token::Struct { name: "Settings", len: 6 }, |             &settings, | ||||||
|             Token::Str("displayedAttributes"), |             &[ | ||||||
|             Token::None, |                 Token::Struct { name: "Settings", len: 6 }, | ||||||
|             Token::Str("searchableAttributes"), |                 Token::Str("displayedAttributes"), | ||||||
|             Token::None, |                 Token::None, | ||||||
|             Token::Str("facetedAttributes"), |                 Token::Str("searchableAttributes"), | ||||||
|             Token::None, |                 Token::None, | ||||||
|             Token::Str("criteria"), |                 Token::Str("facetedAttributes"), | ||||||
|             Token::None, |                 Token::None, | ||||||
|             Token::Str("stopWords"), |                 Token::Str("criteria"), | ||||||
|             Token::None, |                 Token::None, | ||||||
|             Token::Str("synonyms"), |                 Token::Str("stopWords"), | ||||||
|             Token::None, |                 Token::None, | ||||||
|             Token::StructEnd, |                 Token::Str("synonyms"), | ||||||
|         ]); |                 Token::None, | ||||||
|  |                 Token::StructEnd, | ||||||
|  |             ], | ||||||
|  |         ); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
| @@ -1086,9 +1104,6 @@ mod tests { | |||||||
|             synonyms: Setting::NotSet, |             synonyms: Setting::NotSet, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         assert_tokens(&settings, &[ |         assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]); | ||||||
|             Token::Struct { name: "Settings", len: 0 }, |  | ||||||
|             Token::StructEnd, |  | ||||||
|         ]); |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -4,9 +4,9 @@ use std::path::Path; | |||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
|  |  | ||||||
| use crossbeam_channel::Sender; | use crossbeam_channel::Sender; | ||||||
| use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice}; | use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; | ||||||
| use heed::{EnvOpenOptions, Env, Database}; | use heed::{Database, Env, EnvOpenOptions}; | ||||||
| use serde::{Serialize, Deserialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | ||||||
|  |  | ||||||
| @@ -25,7 +25,9 @@ pub trait UpdateHandler<M, N> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<M, N, F> UpdateHandler<M, N> for F | impl<M, N, F> UpdateHandler<M, N> for F | ||||||
| where F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static { | where | ||||||
|  |     F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static, | ||||||
|  | { | ||||||
|     fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> { |     fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> { | ||||||
|         self(update_id, meta, content) |         self(update_id, meta, content) | ||||||
|     } |     } | ||||||
| @@ -82,26 +84,17 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|  |  | ||||||
|     /// Returns the new biggest id to use to store the new update. |     /// Returns the new biggest id to use to store the new update. | ||||||
|     fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> { |     fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> { | ||||||
|         let last_pending = self.pending_meta |         let last_pending = | ||||||
|             .remap_data_type::<DecodeIgnore>() |             self.pending_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get()); | ||||||
|             .last(txn)? |  | ||||||
|             .map(|(k, _)| k.get()); |  | ||||||
|  |  | ||||||
|         let last_processed = self.processed_meta |         let last_processed = | ||||||
|             .remap_data_type::<DecodeIgnore>() |             self.processed_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get()); | ||||||
|             .last(txn)? |  | ||||||
|             .map(|(k, _)| k.get()); |  | ||||||
|  |  | ||||||
|         let last_aborted = self.aborted_meta |         let last_aborted = | ||||||
|             .remap_data_type::<DecodeIgnore>() |             self.aborted_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get()); | ||||||
|             .last(txn)? |  | ||||||
|             .map(|(k, _)| k.get()); |  | ||||||
|  |  | ||||||
|         let last_update_id = [last_pending, last_processed, last_aborted] |         let last_update_id = | ||||||
|             .iter() |             [last_pending, last_processed, last_aborted].iter().copied().flatten().max(); | ||||||
|             .copied() |  | ||||||
|             .flatten() |  | ||||||
|             .max(); |  | ||||||
|  |  | ||||||
|         match last_update_id { |         match last_update_id { | ||||||
|             Some(last_id) => Ok(last_id + 1), |             Some(last_id) => Ok(last_id + 1), | ||||||
| @@ -112,7 +105,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|     /// Registers the update content in the pending store and the meta |     /// Registers the update content in the pending store and the meta | ||||||
|     /// into the pending-meta store. Returns the new unique update id. |     /// into the pending-meta store. Returns the new unique update id. | ||||||
|     pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64> |     pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64> | ||||||
|     where M: Serialize, |     where | ||||||
|  |         M: Serialize, | ||||||
|     { |     { | ||||||
|         let mut wtxn = self.env.write_txn()?; |         let mut wtxn = self.env.write_txn()?; | ||||||
|  |  | ||||||
| @@ -152,9 +146,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|         // a reader while processing it, not a writer. |         // a reader while processing it, not a writer. | ||||||
|         match first_meta { |         match first_meta { | ||||||
|             Some((first_id, first_meta)) => { |             Some((first_id, first_meta)) => { | ||||||
|                 let first_content = self.pending |                 let first_content = | ||||||
|                     .get(&rtxn, &first_id)? |                     self.pending.get(&rtxn, &first_id)?.expect("associated update content"); | ||||||
|                     .expect("associated update content"); |  | ||||||
|  |  | ||||||
|                 // Process the pending update using the provided user function. |                 // Process the pending update using the provided user function. | ||||||
|                 let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?; |                 let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?; | ||||||
| @@ -170,15 +163,16 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|                 wtxn.commit()?; |                 wtxn.commit()?; | ||||||
|  |  | ||||||
|                 Ok(Some((first_id.get(), new_meta))) |                 Ok(Some((first_id.get(), new_meta))) | ||||||
|             }, |             } | ||||||
|             None => Ok(None) |             None => Ok(None), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// The id and metadata of the update that is currently being processed, |     /// The id and metadata of the update that is currently being processed, | ||||||
|     /// `None` if no update is being processed. |     /// `None` if no update is being processed. | ||||||
|     pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>> |     pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>> | ||||||
|     where M: for<'a> Deserialize<'a>, |     where | ||||||
|  |         M: for<'a> Deserialize<'a>, | ||||||
|     { |     { | ||||||
|         let rtxn = self.env.read_txn()?; |         let rtxn = self.env.read_txn()?; | ||||||
|         match self.pending_meta.first(&rtxn)? { |         match self.pending_meta.first(&rtxn)? { | ||||||
| @@ -242,7 +236,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|     /// that as already been processed or which doesn't actually exist, will |     /// that as already been processed or which doesn't actually exist, will | ||||||
|     /// return `None`. |     /// return `None`. | ||||||
|     pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>> |     pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>> | ||||||
|     where M: Serialize + for<'a> Deserialize<'a>, |     where | ||||||
|  |         M: Serialize + for<'a> Deserialize<'a>, | ||||||
|     { |     { | ||||||
|         let mut wtxn = self.env.write_txn()?; |         let mut wtxn = self.env.write_txn()?; | ||||||
|         let key = BEU64::new(update_id); |         let key = BEU64::new(update_id); | ||||||
| @@ -269,7 +264,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> { | |||||||
|     /// Aborts all the pending updates, and not the one being currently processed. |     /// Aborts all the pending updates, and not the one being currently processed. | ||||||
|     /// Returns the update metas and ids that were successfully aborted. |     /// Returns the update metas and ids that were successfully aborted. | ||||||
|     pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>> |     pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>> | ||||||
|     where M: Serialize + for<'a> Deserialize<'a>, |     where | ||||||
|  |         M: Serialize + for<'a> Deserialize<'a>, | ||||||
|     { |     { | ||||||
|         let mut wtxn = self.env.write_txn()?; |         let mut wtxn = self.env.write_txn()?; | ||||||
|         let mut aborted_updates = Vec::new(); |         let mut aborted_updates = Vec::new(); | ||||||
| @@ -303,17 +299,19 @@ pub enum UpdateStatusMeta<M, N> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |  | ||||||
|     use std::thread; |     use std::thread; | ||||||
|     use std::time::{Duration, Instant}; |     use std::time::{Duration, Instant}; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple() { |     fn simple() { | ||||||
|         let dir = tempfile::tempdir().unwrap(); |         let dir = tempfile::tempdir().unwrap(); | ||||||
|         let options = EnvOpenOptions::new(); |         let options = EnvOpenOptions::new(); | ||||||
|         let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { |         let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { | ||||||
|             Ok(meta + " processed") |             Ok(meta + " processed") | ||||||
|         }).unwrap(); |         }) | ||||||
|  |         .unwrap(); | ||||||
|  |  | ||||||
|         let meta = String::from("kiki"); |         let meta = String::from("kiki"); | ||||||
|         let update_id = update_store.register_update(&meta, &[]).unwrap(); |         let update_id = update_store.register_update(&meta, &[]).unwrap(); | ||||||
| @@ -329,10 +327,11 @@ mod tests { | |||||||
|     fn long_running_update() { |     fn long_running_update() { | ||||||
|         let dir = tempfile::tempdir().unwrap(); |         let dir = tempfile::tempdir().unwrap(); | ||||||
|         let options = EnvOpenOptions::new(); |         let options = EnvOpenOptions::new(); | ||||||
|         let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { |         let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { | ||||||
|             thread::sleep(Duration::from_millis(400)); |             thread::sleep(Duration::from_millis(400)); | ||||||
|             Ok(meta + " processed") |             Ok(meta + " processed") | ||||||
|         }).unwrap(); |         }) | ||||||
|  |         .unwrap(); | ||||||
|  |  | ||||||
|         let before_register = Instant::now(); |         let before_register = Instant::now(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,16 +1,14 @@ | |||||||
| use std::fmt::Write as _; | use std::fmt::Write as _; | ||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
| use std::{str, io, fmt}; | use std::{fmt, io, str}; | ||||||
|  |  | ||||||
| use anyhow::Context; | use anyhow::Context; | ||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use structopt::StructOpt; |  | ||||||
|  |  | ||||||
| use milli::facet::FacetType; | use milli::facet::FacetType; | ||||||
| use milli::index::db_name::*; | use milli::index::db_name::*; | ||||||
| use milli::{Index, TreeLevel}; | use milli::{Index, TreeLevel}; | ||||||
|  | use structopt::StructOpt; | ||||||
| use Command::*; | use Command::*; | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| @@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> { | |||||||
|         WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), |         WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), | ||||||
|         WordsPrefixesDocids { full_display, prefixes } => { |         WordsPrefixesDocids { full_display, prefixes } => { | ||||||
|             words_prefixes_docids(&index, &rtxn, !full_display, prefixes) |             words_prefixes_docids(&index, &rtxn, !full_display, prefixes) | ||||||
|         }, |         } | ||||||
|         FacetNumbersDocids { full_display, field_name } => { |         FacetNumbersDocids { full_display, field_name } => { | ||||||
|             facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) |             facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) | ||||||
|         }, |         } | ||||||
|         FacetStringsDocids { full_display, field_name } => { |         FacetStringsDocids { full_display, field_name } => { | ||||||
|             facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) |             facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) | ||||||
|         }, |         } | ||||||
|         WordsLevelPositionsDocids { full_display, words } => { |         WordsLevelPositionsDocids { full_display, words } => { | ||||||
|             words_level_positions_docids(&index, &rtxn, !full_display, words) |             words_level_positions_docids(&index, &rtxn, !full_display, words) | ||||||
|         }, |         } | ||||||
|         WordPrefixesLevelPositionsDocids { full_display, prefixes } => { |         WordPrefixesLevelPositionsDocids { full_display, prefixes } => { | ||||||
|             word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) |             word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) | ||||||
|         }, |         } | ||||||
|         FieldIdWordCountDocids { full_display, field_name } => { |         FieldIdWordCountDocids { full_display, field_name } => { | ||||||
|             field_id_word_count_docids(&index, &rtxn, !full_display, field_name) |             field_id_word_count_docids(&index, &rtxn, !full_display, field_name) | ||||||
|         }, |         } | ||||||
|         DocidsWordsPositions { full_display, internal_documents_ids } => { |         DocidsWordsPositions { full_display, internal_documents_ids } => { | ||||||
|             docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) |             docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) | ||||||
|         }, |         } | ||||||
|         FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), |         FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), | ||||||
|         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), |         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), | ||||||
|         AverageNumberOfPositionsByWord => { |         AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn), | ||||||
|             average_number_of_positions_by_word(&index, &rtxn) |  | ||||||
|         }, |  | ||||||
|         SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), |         SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), | ||||||
|         DatabaseStats { database } => database_stats(&index, &rtxn, &database), |         DatabaseStats { database } => database_stats(&index, &rtxn, &database), | ||||||
|         WordPairProximitiesDocids { full_display, word1, word2 } => { |         WordPairProximitiesDocids { full_display, word1, word2 } => { | ||||||
|             word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) |             word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) | ||||||
|         }, |         } | ||||||
|         ExportWordsFst => export_words_fst(&index, &rtxn), |         ExportWordsFst => export_words_fst(&index, &rtxn), | ||||||
|         ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), |         ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), | ||||||
|         ExportDocuments { internal_documents_ids } => { |         ExportDocuments { internal_documents_ids } => { | ||||||
|             export_documents(&index, &rtxn, internal_documents_ids) |             export_documents(&index, &rtxn, internal_documents_ids) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | ||||||
|     use std::collections::BinaryHeap; |  | ||||||
|     use std::cmp::Reverse; |     use std::cmp::Reverse; | ||||||
|  |     use std::collections::BinaryHeap; | ||||||
|  |  | ||||||
|     let mut heap = BinaryHeap::with_capacity(limit + 1); |     let mut heap = BinaryHeap::with_capacity(limit + 1); | ||||||
|     for result in index.word_docids.iter(rtxn)? { |     for result in index.word_docids.iter(rtxn)? { | ||||||
|         if limit == 0 { break } |         if limit == 0 { | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|         let (word, docids) = result?; |         let (word, docids) = result?; | ||||||
|         heap.push((Reverse(docids.len()), word)); |         heap.push((Reverse(docids.len()), word)); | ||||||
|         if heap.len() > limit { heap.pop(); } |         if heap.len() > limit { | ||||||
|  |             heap.pop(); | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
| @@ -323,7 +323,7 @@ fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( | |||||||
|     rtxn: &'txn heed::RoTxn, |     rtxn: &'txn heed::RoTxn, | ||||||
|     db: heed::Database<KC, DC>, |     db: heed::Database<KC, DC>, | ||||||
|     field_id: u8, |     field_id: u8, | ||||||
| ) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>> | ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>> | ||||||
| where | where | ||||||
|     KC: heed::BytesDecode<'txn>, |     KC: heed::BytesDecode<'txn>, | ||||||
|     DC: heed::BytesDecode<'txn>, |     DC: heed::BytesDecode<'txn>, | ||||||
| @@ -347,7 +347,8 @@ fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> | |||||||
| fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | ||||||
|     use std::cmp::Reverse; |     use std::cmp::Reverse; | ||||||
|     use std::collections::BinaryHeap; |     use std::collections::BinaryHeap; | ||||||
|     use heed::types::{Str, ByteSlice}; |  | ||||||
|  |     use heed::types::{ByteSlice, Str}; | ||||||
|  |  | ||||||
|     let Index { |     let Index { | ||||||
|         env: _env, |         env: _env, | ||||||
| @@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | |||||||
|         let words_fst = index.words_fst(rtxn)?; |         let words_fst = index.words_fst(rtxn)?; | ||||||
|         let length = words_fst.as_fst().as_bytes().len(); |         let length = words_fst.as_fst().as_bytes().len(); | ||||||
|         heap.push(Reverse((length, format!("words-fst"), main_name))); |         heap.push(Reverse((length, format!("words-fst"), main_name))); | ||||||
|         if heap.len() > limit { heap.pop(); } |         if heap.len() > limit { | ||||||
|  |             heap.pop(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         // Fetch the word prefix FST |         // Fetch the word prefix FST | ||||||
|         let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; |         let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; | ||||||
|         let length = words_prefixes_fst.as_fst().as_bytes().len(); |         let length = words_prefixes_fst.as_fst().as_bytes().len(); | ||||||
|         heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); |         heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); | ||||||
|         if heap.len() > limit { heap.pop(); } |         if heap.len() > limit { | ||||||
|  |             heap.pop(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { |         if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { | ||||||
|             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); |             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let (word, value) = result?; |             let (word, value) = result?; | ||||||
|             heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); |             heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let (word, value) = result?; |             let (word, value) = result?; | ||||||
|             heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); |             heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((docid, word), value) = result?; |             let ((docid, word), value) = result?; | ||||||
|             let key = format!("{} {}", docid, word); |             let key = format!("{} {}", docid, word); | ||||||
|             heap.push(Reverse((value.len(), key, docid_word_positions_name))); |             heap.push(Reverse((value.len(), key, docid_word_positions_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((word1, word2, prox), value) = result?; |             let ((word1, word2, prox), value) = result?; | ||||||
|             let key = format!("{} {} {}", word1, word2, prox); |             let key = format!("{} {} {}", word1, word2, prox); | ||||||
|             heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); |             heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((word, prefix, prox), value) = result?; |             let ((word, prefix, prox), value) = result?; | ||||||
|             let key = format!("{} {} {}", word, prefix, prox); |             let key = format!("{} {} {}", word, prefix, prox); | ||||||
|             heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); |             heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((word, level, left, right), value) = result?; |             let ((word, level, left, right), value) = result?; | ||||||
|             let key = format!("{} {} {:?}", word, level, left..=right); |             let key = format!("{} {} {:?}", word, level, left..=right); | ||||||
|             heap.push(Reverse((value.len(), key, word_level_position_docids_name))); |             heap.push(Reverse((value.len(), key, word_level_position_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((word, level, left, right), value) = result?; |             let ((word, level, left, right), value) = result?; | ||||||
|             let key = format!("{} {} {:?}", word, level, left..=right); |             let key = format!("{} {} {:?}", word, level, left..=right); | ||||||
|             heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); |             heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((field_id, word_count), docids) = result?; |             let ((field_id, word_count), docids) = result?; | ||||||
|             let key = format!("{} {}", field_id, word_count); |             let key = format!("{} {}", field_id, word_count); | ||||||
|             heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); |             heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let faceted_fields = index.faceted_fields_ids(rtxn)?; |         let faceted_fields = index.faceted_fields_ids(rtxn)?; | ||||||
| @@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | |||||||
|                 write!(&mut output, " (level {})", level)?; |                 write!(&mut output, " (level {})", level)?; | ||||||
|                 let key = format!("{} {}", facet_name, output); |                 let key = format!("{} {}", facet_name, output); | ||||||
|                 heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); |                 heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); | ||||||
|                 if heap.len() > limit { heap.pop(); } |                 if heap.len() > limit { | ||||||
|  |                     heap.pop(); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // List the facet strings of this facet id. |             // List the facet strings of this facet id. | ||||||
| @@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | |||||||
|                 let ((_fid, fvalue), value) = result?; |                 let ((_fid, fvalue), value) = result?; | ||||||
|                 let key = format!("{} {}", facet_name, fvalue); |                 let key = format!("{} {}", facet_name, fvalue); | ||||||
|                 heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); |                 heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); | ||||||
|                 if heap.len() > limit { heap.pop(); } |                 if heap.len() > limit { | ||||||
|  |                     heap.pop(); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? { |         for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let (id, value) = result?; |             let (id, value) = result?; | ||||||
|             heap.push(Reverse((value.len(), id.to_string(), documents_name))); |             heap.push(Reverse((value.len(), id.to_string(), documents_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { | ||||||
|  |                 heap.pop(); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | |||||||
|     Ok(wtr.flush()?) |     Ok(wtr.flush()?) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> { | fn words_docids( | ||||||
|  |     index: &Index, | ||||||
|  |     rtxn: &heed::RoTxn, | ||||||
|  |     debug: bool, | ||||||
|  |     words: Vec<String>, | ||||||
|  | ) -> anyhow::Result<()> { | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["word", "documents_ids"])?; |     wtr.write_record(&["word", "documents_ids"])?; | ||||||
| @@ -523,8 +557,7 @@ fn words_prefixes_docids( | |||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     debug: bool, |     debug: bool, | ||||||
|     prefixes: Vec<String>, |     prefixes: Vec<String>, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["prefix", "documents_ids"])?; |     wtr.write_record(&["prefix", "documents_ids"])?; | ||||||
| @@ -561,12 +594,12 @@ fn facet_values_docids( | |||||||
|     debug: bool, |     debug: bool, | ||||||
|     facet_type: FacetType, |     facet_type: FacetType, | ||||||
|     field_name: String, |     field_name: String, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let fields_ids_map = index.fields_ids_map(&rtxn)?; |     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||||
|     let faceted_fields = index.faceted_fields_ids(&rtxn)?; |     let faceted_fields = index.faceted_fields_ids(&rtxn)?; | ||||||
|  |  | ||||||
|     let field_id = fields_ids_map.id(&field_name) |     let field_id = fields_ids_map | ||||||
|  |         .id(&field_name) | ||||||
|         .with_context(|| format!("field {} not found", field_name))?; |         .with_context(|| format!("field {} not found", field_name))?; | ||||||
|  |  | ||||||
|     if !faceted_fields.contains(&field_id) { |     if !faceted_fields.contains(&field_id) { | ||||||
| @@ -590,7 +623,7 @@ fn facet_values_docids( | |||||||
|                 }; |                 }; | ||||||
|                 wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; |                 wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         FacetType::String => { |         FacetType::String => { | ||||||
|             wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; |             wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; | ||||||
|             for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { |             for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { | ||||||
| @@ -614,8 +647,7 @@ fn words_level_positions_docids( | |||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     debug: bool, |     debug: bool, | ||||||
|     words: Vec<String>, |     words: Vec<String>, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; |     wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; | ||||||
| @@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids( | |||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     debug: bool, |     debug: bool, | ||||||
|     prefixes: Vec<String>, |     prefixes: Vec<String>, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; |     wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; | ||||||
| @@ -691,21 +722,20 @@ fn field_id_word_count_docids( | |||||||
|     index: &Index, |     index: &Index, | ||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     debug: bool, |     debug: bool, | ||||||
|     field_name: String |     field_name: String, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["field_name", "word_count", "docids"])?; |     wtr.write_record(&["field_name", "word_count", "docids"])?; | ||||||
|  |  | ||||||
|     let field_id = index.fields_ids_map(rtxn)? |     let field_id = index | ||||||
|  |         .fields_ids_map(rtxn)? | ||||||
|         .id(&field_name) |         .id(&field_name) | ||||||
|         .with_context(|| format!("unknown field name: {}", &field_name))?; |         .with_context(|| format!("unknown field name: {}", &field_name))?; | ||||||
|  |  | ||||||
|     let left = (field_id, 0); |     let left = (field_id, 0); | ||||||
|     let right = (field_id, u8::max_value()); |     let right = (field_id, u8::max_value()); | ||||||
|     let iter = index.field_id_word_count_docids |     let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?; | ||||||
|         .range(rtxn, &(left..=right))?; |  | ||||||
|  |  | ||||||
|     for result in iter { |     for result in iter { | ||||||
|         let ((_, word_count), docids) = result?; |         let ((_, word_count), docids) = result?; | ||||||
| @@ -725,8 +755,7 @@ fn docids_words_positions( | |||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     debug: bool, |     debug: bool, | ||||||
|     internal_ids: Vec<u32>, |     internal_ids: Vec<u32>, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["document_id", "word", "positions"])?; |     wtr.write_record(&["document_id", "word", "positions"])?; | ||||||
| @@ -734,9 +763,10 @@ fn docids_words_positions( | |||||||
|     let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { |     let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { | ||||||
|         Box::new(index.docid_word_positions.iter(rtxn)?) |         Box::new(index.docid_word_positions.iter(rtxn)?) | ||||||
|     } else { |     } else { | ||||||
|         let vec: heed::Result<Vec<_>> = internal_ids.into_iter().map(|id| { |         let vec: heed::Result<Vec<_>> = internal_ids | ||||||
|             index.docid_word_positions.prefix_iter(rtxn, &(id, "")) |             .into_iter() | ||||||
|         }).collect(); |             .map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, ""))) | ||||||
|  |             .collect(); | ||||||
|         Box::new(vec?.into_iter().flatten()) |         Box::new(vec?.into_iter().flatten()) | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| @@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> | |||||||
|     let fields_ids_map = index.fields_ids_map(&rtxn)?; |     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||||
|     let faceted_fields = index.faceted_fields_ids(&rtxn)?; |     let faceted_fields = index.faceted_fields_ids(&rtxn)?; | ||||||
|  |  | ||||||
|     let field_id = fields_ids_map.id(&field_name) |     let field_id = fields_ids_map | ||||||
|  |         .id(&field_name) | ||||||
|         .with_context(|| format!("field {} not found", field_name))?; |         .with_context(|| format!("field {} not found", field_name))?; | ||||||
|  |  | ||||||
|     if !faceted_fields.contains(&field_id) { |     if !faceted_fields.contains(&field_id) { | ||||||
| @@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result< | |||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -> anyhow::Result<()> { | fn export_documents( | ||||||
|  |     index: &Index, | ||||||
|  |     rtxn: &heed::RoTxn, | ||||||
|  |     internal_ids: Vec<u32>, | ||||||
|  | ) -> anyhow::Result<()> { | ||||||
|     use std::io::{BufWriter, Write as _}; |     use std::io::{BufWriter, Write as _}; | ||||||
|     use milli::{BEU32, obkv_to_json}; |  | ||||||
|  |     use milli::{obkv_to_json, BEU32}; | ||||||
|  |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut out = BufWriter::new(stdout); |     let mut out = BufWriter::new(stdout); | ||||||
| @@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) - | |||||||
|     let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); |     let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); | ||||||
|  |  | ||||||
|     let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { |     let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { | ||||||
|         Box::new(index.documents.iter(rtxn)?.map(|result| { |         Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) | ||||||
|             result.map(|(_id, obkv)| obkv) |  | ||||||
|         })) |  | ||||||
|     } else { |     } else { | ||||||
|         Box::new(internal_ids.into_iter().flat_map(|id| { |         Box::new( | ||||||
|             index.documents.get(rtxn, &BEU32::new(id)).transpose() |             internal_ids | ||||||
|         })) |                 .into_iter() | ||||||
|  |                 .flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()), | ||||||
|  |         ) | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     for result in iter { |     for result in iter { | ||||||
| @@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) - | |||||||
|  |  | ||||||
| fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { | fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { | ||||||
|     use heed::types::DecodeIgnore; |     use heed::types::DecodeIgnore; | ||||||
|     use milli::{DocumentId, BEU32StrCodec}; |     use milli::{BEU32StrCodec, DocumentId}; | ||||||
|  |  | ||||||
|     let mut words_counts = Vec::new(); |     let mut words_counts = Vec::new(); | ||||||
|     let mut count = 0; |     let mut count = 0; | ||||||
|     let mut prev = None as Option<(DocumentId, u32)>; |     let mut prev = None as Option<(DocumentId, u32)>; | ||||||
|  |  | ||||||
|     let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; |     let iter = | ||||||
|  |         index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; | ||||||
|     for result in iter { |     for result in iter { | ||||||
|         let ((docid, _word), ()) = result?; |         let ((docid, _word), ()) = result?; | ||||||
|  |  | ||||||
|         match prev.as_mut() { |         match prev.as_mut() { | ||||||
|             Some((prev_docid, prev_count)) if docid == *prev_docid => { |             Some((prev_docid, prev_count)) if docid == *prev_docid => { | ||||||
|                 *prev_count += 1; |                 *prev_count += 1; | ||||||
|             }, |             } | ||||||
|             Some((prev_docid, prev_count)) => { |             Some((prev_docid, prev_count)) => { | ||||||
|                 words_counts.push(*prev_count); |                 words_counts.push(*prev_count); | ||||||
|                 *prev_docid = docid; |                 *prev_docid = docid; | ||||||
|                 *prev_count = 0; |                 *prev_count = 0; | ||||||
|                 count += 1; |                 count += 1; | ||||||
|             }, |             } | ||||||
|             None => prev = Some((docid, 1)), |             None => prev = Some((docid, 1)), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a | |||||||
|  |  | ||||||
| fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { | fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { | ||||||
|     use heed::types::ByteSlice; |     use heed::types::ByteSlice; | ||||||
|     use heed::{Error, BytesDecode}; |     use heed::{BytesDecode, Error}; | ||||||
|     use roaring::RoaringBitmap; |  | ||||||
|     use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; |     use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; | ||||||
|  |     use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|     fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( |     fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( | ||||||
|         db: heed::PolyDatabase, |         db: heed::PolyDatabase, | ||||||
|         rtxn: &'a heed::RoTxn, |         rtxn: &'a heed::RoTxn, | ||||||
|         name: &str, |         name: &str, | ||||||
|     ) -> anyhow::Result<()> |     ) -> anyhow::Result<()> { | ||||||
|     { |  | ||||||
|         let mut key_size = 0u64; |         let mut key_size = 0u64; | ||||||
|         let mut val_size = 0u64; |         let mut val_size = 0u64; | ||||||
|         let mut values_length = Vec::new(); |         let mut values_length = Vec::new(); | ||||||
| @@ -1028,27 +1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu | |||||||
|         WORD_DOCIDS => { |         WORD_DOCIDS => { | ||||||
|             let db = index.word_docids.as_polymorph(); |             let db = index.word_docids.as_polymorph(); | ||||||
|             compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         WORD_PREFIX_DOCIDS => { |         WORD_PREFIX_DOCIDS => { | ||||||
|             let db = index.word_prefix_docids.as_polymorph(); |             let db = index.word_prefix_docids.as_polymorph(); | ||||||
|             compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         DOCID_WORD_POSITIONS => { |         DOCID_WORD_POSITIONS => { | ||||||
|             let db = index.docid_word_positions.as_polymorph(); |             let db = index.docid_word_positions.as_polymorph(); | ||||||
|             compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         WORD_PAIR_PROXIMITY_DOCIDS => { |         WORD_PAIR_PROXIMITY_DOCIDS => { | ||||||
|             let db = index.word_pair_proximity_docids.as_polymorph(); |             let db = index.word_pair_proximity_docids.as_polymorph(); | ||||||
|             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { |         WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { | ||||||
|             let db = index.word_prefix_pair_proximity_docids.as_polymorph(); |             let db = index.word_prefix_pair_proximity_docids.as_polymorph(); | ||||||
|             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         FIELD_ID_WORD_COUNT_DOCIDS => { |         FIELD_ID_WORD_COUNT_DOCIDS => { | ||||||
|             let db = index.field_id_word_count_docids.as_polymorph(); |             let db = index.field_id_word_count_docids.as_polymorph(); | ||||||
|             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) |             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) | ||||||
|         }, |         } | ||||||
|         unknown => anyhow::bail!("unknown database {:?}", unknown), |         unknown => anyhow::bail!("unknown database {:?}", unknown), | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids( | |||||||
|     debug: bool, |     debug: bool, | ||||||
|     word1: String, |     word1: String, | ||||||
|     word2: String, |     word2: String, | ||||||
| ) -> anyhow::Result<()> | ) -> anyhow::Result<()> { | ||||||
| { |  | ||||||
|     use heed::types::ByteSlice; |     use heed::types::ByteSlice; | ||||||
|     use milli::RoaringBitmapCodec; |     use milli::RoaringBitmapCodec; | ||||||
|  |  | ||||||
| @@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids( | |||||||
|  |  | ||||||
|         // Skip keys that are longer than the requested one, |         // Skip keys that are longer than the requested one, | ||||||
|         // a longer key means that the second word is a prefix of the request word. |         // a longer key means that the second word is a prefix of the request word. | ||||||
|         if key.len() != prefix.len() + 1 { continue; } |         if key.len() != prefix.len() + 1 { | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         let proximity = key.last().unwrap(); |         let proximity = key.last().unwrap(); | ||||||
|         let docids = if debug { |         let docids = if debug { | ||||||
|   | |||||||
| @@ -1,15 +1,14 @@ | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
|  |  | ||||||
| use regex::Regex; |  | ||||||
| use serde::{Serialize, Deserialize}; |  | ||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
|  | use regex::Regex; | ||||||
|  | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| use crate::error::{Error, UserError}; | use crate::error::{Error, UserError}; | ||||||
|  |  | ||||||
| static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| { | static ASC_DESC_REGEX: Lazy<Regex> = | ||||||
|     Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() |     Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); | ||||||
| }); |  | ||||||
|  |  | ||||||
| #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] | #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] | ||||||
| pub enum Criterion { | pub enum Criterion { | ||||||
| @@ -52,17 +51,21 @@ impl FromStr for Criterion { | |||||||
|             "attribute" => Ok(Criterion::Attribute), |             "attribute" => Ok(Criterion::Attribute), | ||||||
|             "exactness" => Ok(Criterion::Exactness), |             "exactness" => Ok(Criterion::Exactness), | ||||||
|             text => { |             text => { | ||||||
|                 let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { |                 let caps = ASC_DESC_REGEX | ||||||
|                     UserError::InvalidCriterionName { name: text.to_string() } |                     .captures(text) | ||||||
|                 })?; |                     .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; | ||||||
|                 let order = caps.get(1).unwrap().as_str(); |                 let order = caps.get(1).unwrap().as_str(); | ||||||
|                 let field_name = caps.get(2).unwrap().as_str(); |                 let field_name = caps.get(2).unwrap().as_str(); | ||||||
|                 match order { |                 match order { | ||||||
|                     "asc" => Ok(Criterion::Asc(field_name.to_string())), |                     "asc" => Ok(Criterion::Asc(field_name.to_string())), | ||||||
|                     "desc" => Ok(Criterion::Desc(field_name.to_string())), |                     "desc" => Ok(Criterion::Desc(field_name.to_string())), | ||||||
|                     text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), |                     text => { | ||||||
|  |                         return Err( | ||||||
|  |                             UserError::InvalidCriterionName { name: text.to_string() }.into() | ||||||
|  |                         ) | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -82,13 +85,13 @@ impl fmt::Display for Criterion { | |||||||
|         use Criterion::*; |         use Criterion::*; | ||||||
|  |  | ||||||
|         match self { |         match self { | ||||||
|             Words           => f.write_str("words"), |             Words => f.write_str("words"), | ||||||
|             Typo            => f.write_str("typo"), |             Typo => f.write_str("typo"), | ||||||
|             Proximity       => f.write_str("proximity"), |             Proximity => f.write_str("proximity"), | ||||||
|             Attribute       => f.write_str("attribute"), |             Attribute => f.write_str("attribute"), | ||||||
|             Exactness       => f.write_str("exactness"), |             Exactness => f.write_str("exactness"), | ||||||
|             Asc(attr)       => write!(f, "asc({})", attr), |             Asc(attr) => write!(f, "asc({})", attr), | ||||||
|             Desc(attr)      => write!(f, "desc({})", attr), |             Desc(attr) => write!(f, "desc({})", attr), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ use std::convert::Infallible; | |||||||
| use std::error::Error as StdError; | use std::error::Error as StdError; | ||||||
| use std::{fmt, io, str}; | use std::{fmt, io, str}; | ||||||
|  |  | ||||||
| use heed::{MdbError, Error as HeedError}; | use heed::{Error as HeedError, MdbError}; | ||||||
| use rayon::ThreadPoolBuildError; | use rayon::ThreadPoolBuildError; | ||||||
| use serde_json::{Map, Value}; | use serde_json::{Map, Value}; | ||||||
|  |  | ||||||
| @@ -80,14 +80,17 @@ impl From<fst::Error> for Error { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<E> From<grenad::Error<E>> for Error where Error: From<E> { | impl<E> From<grenad::Error<E>> for Error | ||||||
|  | where | ||||||
|  |     Error: From<E>, | ||||||
|  | { | ||||||
|     fn from(error: grenad::Error<E>) -> Error { |     fn from(error: grenad::Error<E>) -> Error { | ||||||
|         match error { |         match error { | ||||||
|             grenad::Error::Io(error) => Error::IoError(error), |             grenad::Error::Io(error) => Error::IoError(error), | ||||||
|             grenad::Error::Merge(error) => Error::from(error), |             grenad::Error::Merge(error) => Error::from(error), | ||||||
|             grenad::Error::InvalidCompressionType => { |             grenad::Error::InvalidCompressionType => { | ||||||
|                 Error::InternalError(InternalError::GrenadInvalidCompressionType) |                 Error::InternalError(InternalError::GrenadInvalidCompressionType) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -171,15 +174,15 @@ impl fmt::Display for InternalError { | |||||||
|         match self { |         match self { | ||||||
|             Self::DatabaseMissingEntry { db_name, key } => { |             Self::DatabaseMissingEntry { db_name, key } => { | ||||||
|                 write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) |                 write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) | ||||||
|             }, |             } | ||||||
|             Self::FieldIdMapMissingEntry(error) => error.fmt(f), |             Self::FieldIdMapMissingEntry(error) => error.fmt(f), | ||||||
|             Self::Fst(error) => error.fmt(f), |             Self::Fst(error) => error.fmt(f), | ||||||
|             Self::GrenadInvalidCompressionType => { |             Self::GrenadInvalidCompressionType => { | ||||||
|                 f.write_str("invalid compression type have been specified to grenad") |                 f.write_str("invalid compression type have been specified to grenad") | ||||||
|             }, |             } | ||||||
|             Self::IndexingMergingKeys { process } => { |             Self::IndexingMergingKeys { process } => { | ||||||
|                 write!(f, "invalid merge while processing {}", process) |                 write!(f, "invalid merge while processing {}", process) | ||||||
|             }, |             } | ||||||
|             Self::Serialization(error) => error.fmt(f), |             Self::Serialization(error) => error.fmt(f), | ||||||
|             Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), |             Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), | ||||||
|             Self::RayonThreadPool(error) => error.fmt(f), |             Self::RayonThreadPool(error) => error.fmt(f), | ||||||
| @@ -204,12 +207,12 @@ impl fmt::Display for UserError { | |||||||
|             Self::InvalidDocumentId { document_id } => { |             Self::InvalidDocumentId { document_id } => { | ||||||
|                 let json = serde_json::to_string(document_id).unwrap(); |                 let json = serde_json::to_string(document_id).unwrap(); | ||||||
|                 write!(f, "document identifier is invalid {}", json) |                 write!(f, "document identifier is invalid {}", json) | ||||||
|             }, |             } | ||||||
|             Self::InvalidFilterAttribute(error) => error.fmt(f), |             Self::InvalidFilterAttribute(error) => error.fmt(f), | ||||||
|             Self::MissingDocumentId { document } => { |             Self::MissingDocumentId { document } => { | ||||||
|                 let json = serde_json::to_string(document).unwrap(); |                 let json = serde_json::to_string(document).unwrap(); | ||||||
|                 write!(f, "document doesn't have an identifier {}", json) |                 write!(f, "document doesn't have an identifier {}", json) | ||||||
|             }, |             } | ||||||
|             Self::MissingPrimaryKey => f.write_str("missing primary key"), |             Self::MissingPrimaryKey => f.write_str("missing primary key"), | ||||||
|             Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), |             Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), | ||||||
|             // TODO where can we find it instead of writing the text ourselves? |             // TODO where can we find it instead of writing the text ourselves? | ||||||
| @@ -217,14 +220,14 @@ impl fmt::Display for UserError { | |||||||
|             Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), |             Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), | ||||||
|             Self::PrimaryKeyCannotBeChanged => { |             Self::PrimaryKeyCannotBeChanged => { | ||||||
|                 f.write_str("primary key cannot be changed if the database contains documents") |                 f.write_str("primary key cannot be changed if the database contains documents") | ||||||
|             }, |             } | ||||||
|             Self::PrimaryKeyCannotBeReset => { |             Self::PrimaryKeyCannotBeReset => { | ||||||
|                 f.write_str("primary key cannot be reset if the database contains documents") |                 f.write_str("primary key cannot be reset if the database contains documents") | ||||||
|             }, |             } | ||||||
|             Self::SerdeJson(error) => error.fmt(f), |             Self::SerdeJson(error) => error.fmt(f), | ||||||
|             Self::UnknownInternalDocumentId { document_id } => { |             Self::UnknownInternalDocumentId { document_id } => { | ||||||
|                 write!(f, "an unknown internal document id have been used ({})", document_id) |                 write!(f, "an unknown internal document id have been used ({})", document_id) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry { | |||||||
|         match self { |         match self { | ||||||
|             Self::FieldId { field_id, process } => { |             Self::FieldId { field_id, process } => { | ||||||
|                 write!(f, "unknown field id {} coming from the {} process", field_id, process) |                 write!(f, "unknown field id {} coming from the {} process", field_id, process) | ||||||
|             }, |             } | ||||||
|             Self::FieldName { field_name, process } => { |             Self::FieldName { field_name, process } => { | ||||||
|                 write!(f, "unknown field name {} coming from the {} process", field_name, process) |                 write!(f, "unknown field name {} coming from the {} process", field_name, process) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -251,11 +254,11 @@ impl fmt::Display for SerializationError { | |||||||
|         match self { |         match self { | ||||||
|             Self::Decoding { db_name: Some(name) } => { |             Self::Decoding { db_name: Some(name) } => { | ||||||
|                 write!(f, "decoding from the {} database failed", name) |                 write!(f, "decoding from the {} database failed", name) | ||||||
|             }, |             } | ||||||
|             Self::Decoding { db_name: None } => f.write_str("decoding failed"), |             Self::Decoding { db_name: None } => f.write_str("decoding failed"), | ||||||
|             Self::Encoding { db_name: Some(name) } => { |             Self::Encoding { db_name: Some(name) } => { | ||||||
|                 write!(f, "encoding into the {} database failed", name) |                 write!(f, "encoding into the {} database failed", name) | ||||||
|             }, |             } | ||||||
|             Self::Encoding { db_name: None } => f.write_str("encoding failed"), |             Self::Encoding { db_name: None } => f.write_str("encoding failed"), | ||||||
|             Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), |             Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use fst::{Streamer, IntoStreamer}; |  | ||||||
|  | use fst::{IntoStreamer, Streamer}; | ||||||
|  |  | ||||||
| pub struct ExternalDocumentsIds<'a> { | pub struct ExternalDocumentsIds<'a> { | ||||||
|     pub(crate) hard: fst::Map<Cow<'a, [u8]>>, |     pub(crate) hard: fst::Map<Cow<'a, [u8]>>, | ||||||
| @@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> ExternalDocumentsIds<'a> { | impl<'a> ExternalDocumentsIds<'a> { | ||||||
|     pub fn new(hard: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> { |     pub fn new( | ||||||
|  |         hard: fst::Map<Cow<'a, [u8]>>, | ||||||
|  |         soft: fst::Map<Cow<'a, [u8]>>, | ||||||
|  |     ) -> ExternalDocumentsIds<'a> { | ||||||
|         ExternalDocumentsIds { hard, soft } |         ExternalDocumentsIds { hard, soft } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> { | |||||||
|         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { |         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { | ||||||
|             // u64 MAX means deleted in the soft fst map |             // u64 MAX means deleted in the soft fst map | ||||||
|             Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), |             Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), | ||||||
|             _otherwise => None |             _otherwise => None, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,10 +2,9 @@ use std::error::Error; | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
|  |  | ||||||
| use serde::{Serialize, Deserialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] | #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] | ||||||
| #[derive(Serialize, Deserialize)] |  | ||||||
| pub enum FacetType { | pub enum FacetType { | ||||||
|     String, |     String, | ||||||
|     Number, |     Number, | ||||||
| @@ -43,4 +42,4 @@ impl fmt::Display for InvalidFacetType { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Error for InvalidFacetType { } | impl Error for InvalidFacetType {} | ||||||
|   | |||||||
| @@ -50,7 +50,7 @@ impl Serialize for FacetValue { | |||||||
|             FacetValue::Number(number) => { |             FacetValue::Number(number) => { | ||||||
|                 let string = number.to_string(); |                 let string = number.to_string(); | ||||||
|                 serializer.serialize_str(&string) |                 serializer.serialize_str(&string) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] { | |||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use std::cmp::Ordering::Less; |     use std::cmp::Ordering::Less; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     fn is_sorted<T: Ord>(x: &[T]) -> bool { |     fn is_sorted<T: Ord>(x: &[T]) -> bool { | ||||||
| @@ -39,8 +40,8 @@ mod tests { | |||||||
|         let a = -13_f64; |         let a = -13_f64; | ||||||
|         let b = -10.0; |         let b = -10.0; | ||||||
|         let c = -0.0; |         let c = -0.0; | ||||||
|         let d =  1.0; |         let d = 1.0; | ||||||
|         let e =  43.0; |         let e = 43.0; | ||||||
|  |  | ||||||
|         let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); |         let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); | ||||||
|         assert!(is_sorted(&vec), "{:?}", vec); |         assert!(is_sorted(&vec), "{:?}", vec); | ||||||
|   | |||||||
| @@ -1,5 +1,7 @@ | |||||||
| use std::collections::BTreeMap; | use std::collections::BTreeMap; | ||||||
| use serde::{Serialize, Deserialize}; |  | ||||||
|  | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| use crate::FieldId; | use crate::FieldId; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | #[derive(Debug, Clone, Serialize, Deserialize)] | ||||||
| @@ -11,11 +13,7 @@ pub struct FieldsIdsMap { | |||||||
|  |  | ||||||
| impl FieldsIdsMap { | impl FieldsIdsMap { | ||||||
|     pub fn new() -> FieldsIdsMap { |     pub fn new() -> FieldsIdsMap { | ||||||
|         FieldsIdsMap { |         FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) } | ||||||
|             names_ids: BTreeMap::new(), |  | ||||||
|             ids_names: BTreeMap::new(), |  | ||||||
|             next_id: Some(0), |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the number of fields ids in the map. |     /// Returns the number of fields ids in the map. | ||||||
| @@ -62,17 +60,17 @@ impl FieldsIdsMap { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Iterate over the ids and names in the ids order. |     /// Iterate over the ids and names in the ids order. | ||||||
|     pub fn iter(&self) -> impl Iterator<Item=(FieldId, &str)> { |     pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> { | ||||||
|         self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) |         self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Iterate over the ids in the order of the ids. |     /// Iterate over the ids in the order of the ids. | ||||||
|     pub fn ids<'a>(&'a self) -> impl Iterator<Item=FieldId> + 'a { |     pub fn ids<'a>(&'a self) -> impl Iterator<Item = FieldId> + 'a { | ||||||
|         self.ids_names.keys().copied() |         self.ids_names.keys().copied() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Iterate over the names in the order of the ids. |     /// Iterate over the names in the order of the ids. | ||||||
|     pub fn names(&self) -> impl Iterator<Item=&str> { |     pub fn names(&self) -> impl Iterator<Item = &str> { | ||||||
|         self.ids_names.values().map(AsRef::as_ref) |         self.ids_names.values().map(AsRef::as_ref) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use heed::{BytesEncode, BytesDecode}; |     use heed::{BytesDecode, BytesEncode}; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|   | |||||||
| @@ -1,8 +1,8 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
|  |  | ||||||
| use crate::{FieldId, DocumentId}; |  | ||||||
| use crate::facet::value_encoding::f64_into_bytes; | use crate::facet::value_encoding::f64_into_bytes; | ||||||
|  | use crate::{DocumentId, FieldId}; | ||||||
|  |  | ||||||
| pub struct FieldDocIdFacetF64Codec; | pub struct FieldDocIdFacetF64Codec; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,12 +2,17 @@ use std::borrow::Cow; | |||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::str; | use std::str; | ||||||
|  |  | ||||||
| use crate::{FieldId, DocumentId}; | use crate::{DocumentId, FieldId}; | ||||||
|  |  | ||||||
| pub struct FieldDocIdFacetStringCodec; | pub struct FieldDocIdFacetStringCodec; | ||||||
|  |  | ||||||
| impl FieldDocIdFacetStringCodec { | impl FieldDocIdFacetStringCodec { | ||||||
|     pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec<u8>) { |     pub fn serialize_into( | ||||||
|  |         field_id: FieldId, | ||||||
|  |         document_id: DocumentId, | ||||||
|  |         value: &str, | ||||||
|  |         out: &mut Vec<u8>, | ||||||
|  |     ) { | ||||||
|         out.reserve(1 + 4 + value.len()); |         out.reserve(1 + 4 + value.len()); | ||||||
|         out.push(field_id); |         out.push(field_id); | ||||||
|         out.extend_from_slice(&document_id.to_be_bytes()); |         out.extend_from_slice(&document_id.to_be_bytes()); | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| use std::{borrow::Cow, convert::TryInto}; | use std::borrow::Cow; | ||||||
|  | use std::convert::TryInto; | ||||||
|  |  | ||||||
| use crate::FieldId; | use crate::FieldId; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,16 +1,18 @@ | |||||||
| mod beu32_str_codec; | mod beu32_str_codec; | ||||||
|  | pub mod facet; | ||||||
|  | mod field_id_word_count_codec; | ||||||
| mod obkv_codec; | mod obkv_codec; | ||||||
| mod roaring_bitmap; | mod roaring_bitmap; | ||||||
| mod roaring_bitmap_length; | mod roaring_bitmap_length; | ||||||
| mod str_level_position_codec; | mod str_level_position_codec; | ||||||
| mod str_str_u8_codec; | mod str_str_u8_codec; | ||||||
| mod field_id_word_count_codec; |  | ||||||
| pub mod facet; |  | ||||||
|  |  | ||||||
| pub use self::beu32_str_codec::BEU32StrCodec; | pub use self::beu32_str_codec::BEU32StrCodec; | ||||||
|  | pub use self::field_id_word_count_codec::FieldIdWordCountCodec; | ||||||
| pub use self::obkv_codec::ObkvCodec; | pub use self::obkv_codec::ObkvCodec; | ||||||
| pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; | pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; | ||||||
| pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; | pub use self::roaring_bitmap_length::{ | ||||||
|  |     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, | ||||||
|  | }; | ||||||
| pub use self::str_level_position_codec::StrLevelPositionCodec; | pub use self::str_level_position_codec::StrLevelPositionCodec; | ||||||
| pub use self::str_str_u8_codec::StrStrU8Codec; | pub use self::str_str_u8_codec::StrStrU8Codec; | ||||||
| pub use self::field_id_word_count_codec::FieldIdWordCountCodec; |  | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  |  | ||||||
| use obkv::{KvReader, KvWriter}; | use obkv::{KvReader, KvWriter}; | ||||||
|  |  | ||||||
| pub struct ObkvCodec; | pub struct ObkvCodec; | ||||||
|   | |||||||
| @@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { | |||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use std::iter::FromIterator; |     use std::iter::FromIterator; | ||||||
|     use heed::{BytesEncode, BytesDecode}; |  | ||||||
|  |     use heed::{BytesDecode, BytesEncode}; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| pub struct RoaringBitmapCodec; | pub struct RoaringBitmapCodec; | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use std::io::{self, Read, BufRead}; | use std::io::{self, BufRead, Read}; | ||||||
| use std::mem; | use std::mem; | ||||||
|  |  | ||||||
| use byteorder::{ReadBytesExt, LittleEndian}; | use byteorder::{LittleEndian, ReadBytesExt}; | ||||||
|  |  | ||||||
| const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; | const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; | ||||||
| const SERIAL_COOKIE: u16 = 12347; | const SERIAL_COOKIE: u16 = 12347; | ||||||
| @@ -16,20 +16,14 @@ impl RoaringBitmapLenCodec { | |||||||
|             if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { |             if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { | ||||||
|                 (bytes.read_u32::<LittleEndian>()? as usize, true) |                 (bytes.read_u32::<LittleEndian>()? as usize, true) | ||||||
|             } else if (cookie as u16) == SERIAL_COOKIE { |             } else if (cookie as u16) == SERIAL_COOKIE { | ||||||
|                 return Err(io::Error::new( |                 return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); | ||||||
|                     io::ErrorKind::Other, |  | ||||||
|                     "run containers are unsupported", |  | ||||||
|                 )); |  | ||||||
|             } else { |             } else { | ||||||
|                 return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); |                 return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         if size > u16::max_value() as usize + 1 { |         if size > u16::max_value() as usize + 1 { | ||||||
|             return Err(io::Error::new( |             return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); | ||||||
|                 io::ErrorKind::Other, |  | ||||||
|                 "size is greater than supported", |  | ||||||
|             )); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let mut description_bytes = vec![0u8; size * 4]; |         let mut description_bytes = vec![0u8; size * 4]; | ||||||
| @@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     use crate::heed_codec::RoaringBitmapCodec; |  | ||||||
|     use heed::BytesEncode; |     use heed::BytesEncode; | ||||||
|     use roaring::RoaringBitmap; |     use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |     use crate::heed_codec::RoaringBitmapCodec; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn deserialize_roaring_bitmap_length() { |     fn deserialize_roaring_bitmap_length() { | ||||||
|         let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); |         let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); | ||||||
|   | |||||||
| @@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { | |||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|         let footer_len = size_of::<u8>() + size_of::<u32>() * 2; |         let footer_len = size_of::<u8>() + size_of::<u32>() * 2; | ||||||
|  |  | ||||||
|         if bytes.len() < footer_len { return None } |         if bytes.len() < footer_len { | ||||||
|  |             return None; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); |         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||||
|         let word = str::from_utf8(word).ok()?; |         let word = str::from_utf8(word).ok()?; | ||||||
|   | |||||||
| @@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet}; | |||||||
| use std::path::Path; | use std::path::Path; | ||||||
|  |  | ||||||
| use chrono::{DateTime, Utc}; | use chrono::{DateTime, Utc}; | ||||||
| use heed::{Database, PolyDatabase, RoTxn, RwTxn}; |  | ||||||
| use heed::types::*; | use heed::types::*; | ||||||
|  | use heed::{Database, PolyDatabase, RoTxn, RwTxn}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; | use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; | ||||||
| use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; |  | ||||||
| use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; |  | ||||||
| use crate::{ |  | ||||||
|     BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, |  | ||||||
|     ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, |  | ||||||
|     FieldIdWordCountCodec, |  | ||||||
| }; |  | ||||||
| use crate::heed_codec::facet::{ |  | ||||||
|     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, |  | ||||||
|     FacetValueStringCodec, FacetLevelValueF64Codec, |  | ||||||
| }; |  | ||||||
| use crate::fields_ids_map::FieldsIdsMap; | use crate::fields_ids_map::FieldsIdsMap; | ||||||
|  | use crate::heed_codec::facet::{ | ||||||
|  |     FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, | ||||||
|  |     FieldDocIdFacetStringCodec, | ||||||
|  | }; | ||||||
|  | use crate::{ | ||||||
|  |     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, | ||||||
|  |     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec, | ||||||
|  |     FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, | ||||||
|  |     StrLevelPositionCodec, StrStrU8Codec, BEU32, | ||||||
|  | }; | ||||||
|  |  | ||||||
| pub mod main_key { | pub mod main_key { | ||||||
|     pub const CRITERIA_KEY: &str = "criteria"; |     pub const CRITERIA_KEY: &str = "criteria"; | ||||||
| @@ -114,14 +113,17 @@ impl Index { | |||||||
|         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; |         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; | ||||||
|         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; |         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; | ||||||
|         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; |         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; | ||||||
|         let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; |         let word_prefix_pair_proximity_docids = | ||||||
|  |             env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; | ||||||
|         let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; |         let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; | ||||||
|         let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; |         let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; | ||||||
|         let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; |         let word_prefix_level_position_docids = | ||||||
|  |             env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; | ||||||
|         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; |         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; | ||||||
|         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; |         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; | ||||||
|         let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; |         let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; | ||||||
|         let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; |         let field_id_docid_facet_strings = | ||||||
|  |             env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; | ||||||
|         let documents = env.create_database(Some(DOCUMENTS))?; |         let documents = env.create_database(Some(DOCUMENTS))?; | ||||||
|  |  | ||||||
|         Index::initialize_creation_dates(&env, main)?; |         Index::initialize_creation_dates(&env, main)?; | ||||||
| @@ -184,18 +186,26 @@ impl Index { | |||||||
|     /* documents ids */ |     /* documents ids */ | ||||||
|  |  | ||||||
|     /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. |     /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. | ||||||
|     pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { |     pub(crate) fn put_documents_ids( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         docids: &RoaringBitmap, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) |         self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the internal documents ids. |     /// Returns the internal documents ids. | ||||||
|     pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> { |     pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> { | ||||||
|         Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) |         Ok(self | ||||||
|  |             .main | ||||||
|  |             .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)? | ||||||
|  |             .unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the number of documents indexed in the database. |     /// Returns the number of documents indexed in the database. | ||||||
|     pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> { |     pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> { | ||||||
|         let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; |         let count = | ||||||
|  |             self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; | ||||||
|         Ok(count.unwrap_or_default()) |         Ok(count.unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -224,21 +234,30 @@ impl Index { | |||||||
|         &self, |         &self, | ||||||
|         wtxn: &mut RwTxn, |         wtxn: &mut RwTxn, | ||||||
|         external_documents_ids: &ExternalDocumentsIds<'a>, |         external_documents_ids: &ExternalDocumentsIds<'a>, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |  | ||||||
|         let ExternalDocumentsIds { hard, soft } = external_documents_ids; |         let ExternalDocumentsIds { hard, soft } = external_documents_ids; | ||||||
|         let hard = hard.as_fst().as_bytes(); |         let hard = hard.as_fst().as_bytes(); | ||||||
|         let soft = soft.as_fst().as_bytes(); |         let soft = soft.as_fst().as_bytes(); | ||||||
|         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; |         self.main.put::<_, Str, ByteSlice>( | ||||||
|         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; |             wtxn, | ||||||
|  |             main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, | ||||||
|  |             hard, | ||||||
|  |         )?; | ||||||
|  |         self.main.put::<_, Str, ByteSlice>( | ||||||
|  |             wtxn, | ||||||
|  |             main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, | ||||||
|  |             soft, | ||||||
|  |         )?; | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the external documents ids map which associate the external ids |     /// Returns the external documents ids map which associate the external ids | ||||||
|     /// with the internal ids (i.e. `u32`). |     /// with the internal ids (i.e. `u32`). | ||||||
|     pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> { |     pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> { | ||||||
|         let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; |         let hard = | ||||||
|         let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; |             self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; | ||||||
|  |         let soft = | ||||||
|  |             self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; | ||||||
|         let hard = match hard { |         let hard = match hard { | ||||||
|             Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, |             Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, | ||||||
|             None => fst::Map::default().map_data(Cow::Owned)?, |             None => fst::Map::default().map_data(Cow::Owned)?, | ||||||
| @@ -254,42 +273,62 @@ impl Index { | |||||||
|  |  | ||||||
|     /// Writes the fields ids map which associates the documents keys with an internal field id |     /// Writes the fields ids map which associates the documents keys with an internal field id | ||||||
|     /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. |     /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. | ||||||
|     pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { |     pub(crate) fn put_fields_ids_map( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         map: &FieldsIdsMap, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) |         self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the fields ids map which associates the documents keys with an internal field id |     /// Returns the fields ids map which associates the documents keys with an internal field id | ||||||
|     /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. |     /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. | ||||||
|     pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> { |     pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> { | ||||||
|         Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>( |         Ok(self | ||||||
|             rtxn, |             .main | ||||||
|             main_key::FIELDS_IDS_MAP_KEY, |             .get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, main_key::FIELDS_IDS_MAP_KEY)? | ||||||
|         )?.unwrap_or_default()) |             .unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* fields distribution */ |     /* fields distribution */ | ||||||
|  |  | ||||||
|     /// Writes the fields distribution which associates every field name with |     /// Writes the fields distribution which associates every field name with | ||||||
|     /// the number of times it occurs in the documents. |     /// the number of times it occurs in the documents. | ||||||
|     pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { |     pub(crate) fn put_fields_distribution( | ||||||
|         self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         distribution: &FieldsDistribution, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|  |         self.main.put::<_, Str, SerdeJson<FieldsDistribution>>( | ||||||
|  |             wtxn, | ||||||
|  |             main_key::FIELDS_DISTRIBUTION_KEY, | ||||||
|  |             distribution, | ||||||
|  |         ) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the fields distribution which associates every field name with |     /// Returns the fields distribution which associates every field name with | ||||||
|     /// the number of times it occurs in the documents. |     /// the number of times it occurs in the documents. | ||||||
|     pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { |     pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { | ||||||
|         Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>( |         Ok(self | ||||||
|             rtxn, |             .main | ||||||
|             main_key::FIELDS_DISTRIBUTION_KEY, |             .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)? | ||||||
|         )?.unwrap_or_default()) |             .unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* displayed fields */ |     /* displayed fields */ | ||||||
|  |  | ||||||
|     /// Writes the fields that must be displayed in the defined order. |     /// Writes the fields that must be displayed in the defined order. | ||||||
|     /// There must not be any duplicate field id. |     /// There must not be any duplicate field id. | ||||||
|     pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { |     pub(crate) fn put_displayed_fields( | ||||||
|         self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fields: &[&str], | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|  |         self.main.put::<_, Str, SerdeBincode<&[&str]>>( | ||||||
|  |             wtxn, | ||||||
|  |             main_key::DISPLAYED_FIELDS_KEY, | ||||||
|  |             &fields, | ||||||
|  |         ) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Deletes the displayed fields ids, this will make the engine display |     /// Deletes the displayed fields ids, this will make the engine display | ||||||
| @@ -313,14 +352,17 @@ impl Index { | |||||||
|                 for name in fields.into_iter() { |                 for name in fields.into_iter() { | ||||||
|                     match fields_ids_map.id(name) { |                     match fields_ids_map.id(name) { | ||||||
|                         Some(field_id) => fields_ids.push(field_id), |                         Some(field_id) => fields_ids.push(field_id), | ||||||
|                         None => return Err(FieldIdMapMissingEntry::FieldName { |                         None => { | ||||||
|                             field_name: name.to_string(), |                             return Err(FieldIdMapMissingEntry::FieldName { | ||||||
|                             process: "Index::displayed_fields_ids", |                                 field_name: name.to_string(), | ||||||
|                         }.into()), |                                 process: "Index::displayed_fields_ids", | ||||||
|  |                             } | ||||||
|  |                             .into()) | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(Some(fields_ids)) |                 Ok(Some(fields_ids)) | ||||||
|             }, |             } | ||||||
|             None => Ok(None), |             None => Ok(None), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -328,8 +370,16 @@ impl Index { | |||||||
|     /* searchable fields */ |     /* searchable fields */ | ||||||
|  |  | ||||||
|     /// Writes the searchable fields, when this list is specified, only these are indexed. |     /// Writes the searchable fields, when this list is specified, only these are indexed. | ||||||
|     pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { |     pub(crate) fn put_searchable_fields( | ||||||
|         self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fields: &[&str], | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|  |         self.main.put::<_, Str, SerdeBincode<&[&str]>>( | ||||||
|  |             wtxn, | ||||||
|  |             main_key::SEARCHABLE_FIELDS_KEY, | ||||||
|  |             &fields, | ||||||
|  |         ) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Deletes the searchable fields, when no fields are specified, all fields are indexed. |     /// Deletes the searchable fields, when no fields are specified, all fields are indexed. | ||||||
| @@ -352,14 +402,17 @@ impl Index { | |||||||
|                 for name in fields { |                 for name in fields { | ||||||
|                     match fields_ids_map.id(name) { |                     match fields_ids_map.id(name) { | ||||||
|                         Some(field_id) => fields_ids.push(field_id), |                         Some(field_id) => fields_ids.push(field_id), | ||||||
|                         None => return Err(FieldIdMapMissingEntry::FieldName { |                         None => { | ||||||
|                             field_name: name.to_string(), |                             return Err(FieldIdMapMissingEntry::FieldName { | ||||||
|                             process: "Index::searchable_fields_ids", |                                 field_name: name.to_string(), | ||||||
|                         }.into()), |                                 process: "Index::searchable_fields_ids", | ||||||
|  |                             } | ||||||
|  |                             .into()) | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(Some(fields_ids)) |                 Ok(Some(fields_ids)) | ||||||
|             }, |             } | ||||||
|             None => Ok(None), |             None => Ok(None), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -367,7 +420,11 @@ impl Index { | |||||||
|     /* filterable fields */ |     /* filterable fields */ | ||||||
|  |  | ||||||
|     /// Writes the filterable fields names in the database. |     /// Writes the filterable fields names in the database. | ||||||
|     pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> { |     pub(crate) fn put_filterable_fields( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fields: &HashSet<String>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) |         self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -378,10 +435,10 @@ impl Index { | |||||||
|  |  | ||||||
|     /// Returns the filterable fields names. |     /// Returns the filterable fields names. | ||||||
|     pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { |     pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { | ||||||
|         Ok(self.main.get::<_, Str, SerdeJson<_>>( |         Ok(self | ||||||
|             rtxn, |             .main | ||||||
|             main_key::FILTERABLE_FIELDS_KEY, |             .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)? | ||||||
|         )?.unwrap_or_default()) |             .unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Identical to `filterable_fields`, but returns ids instead. |     /// Identical to `filterable_fields`, but returns ids instead. | ||||||
| @@ -394,11 +451,14 @@ impl Index { | |||||||
|             match fields_ids_map.id(&name) { |             match fields_ids_map.id(&name) { | ||||||
|                 Some(field_id) => { |                 Some(field_id) => { | ||||||
|                     fields_ids.insert(field_id); |                     fields_ids.insert(field_id); | ||||||
|                 }, |                 } | ||||||
|                 None => return Err(FieldIdMapMissingEntry::FieldName { |                 None => { | ||||||
|                     field_name: name, |                     return Err(FieldIdMapMissingEntry::FieldName { | ||||||
|                     process: "Index::filterable_fields_ids", |                         field_name: name, | ||||||
|                 }.into()), |                         process: "Index::filterable_fields_ids", | ||||||
|  |                     } | ||||||
|  |                     .into()) | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -413,9 +473,8 @@ impl Index { | |||||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { |     pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { | ||||||
|         let filterable_fields = self.filterable_fields(rtxn)?; |         let filterable_fields = self.filterable_fields(rtxn)?; | ||||||
|         let distinct_field = self.distinct_field(rtxn)?; |         let distinct_field = self.distinct_field(rtxn)?; | ||||||
|         let asc_desc_fields = self.criteria(rtxn)? |         let asc_desc_fields = | ||||||
|             .into_iter() |             self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { | ||||||
|             .filter_map(|criterion| match criterion { |  | ||||||
|                 Criterion::Asc(field) | Criterion::Desc(field) => Some(field), |                 Criterion::Asc(field) | Criterion::Desc(field) => Some(field), | ||||||
|                 _otherwise => None, |                 _otherwise => None, | ||||||
|             }); |             }); | ||||||
| @@ -439,11 +498,14 @@ impl Index { | |||||||
|             match fields_ids_map.id(&name) { |             match fields_ids_map.id(&name) { | ||||||
|                 Some(field_id) => { |                 Some(field_id) => { | ||||||
|                     fields_ids.insert(field_id); |                     fields_ids.insert(field_id); | ||||||
|                 }, |                 } | ||||||
|                 None => return Err(FieldIdMapMissingEntry::FieldName { |                 None => { | ||||||
|                     field_name: name, |                     return Err(FieldIdMapMissingEntry::FieldName { | ||||||
|                     process: "Index::faceted_fields_ids", |                         field_name: name, | ||||||
|                 }.into()), |                         process: "Index::faceted_fields_ids", | ||||||
|  |                     } | ||||||
|  |                     .into()) | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -458,8 +520,7 @@ impl Index { | |||||||
|         wtxn: &mut RwTxn, |         wtxn: &mut RwTxn, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         docids: &RoaringBitmap, |         docids: &RoaringBitmap, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |  | ||||||
|         let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; |         let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|         buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] |         buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||||
|             .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); |             .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
| @@ -472,8 +533,7 @@ impl Index { | |||||||
|         &self, |         &self, | ||||||
|         rtxn: &RoTxn, |         rtxn: &RoTxn, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|     ) -> heed::Result<RoaringBitmap> |     ) -> heed::Result<RoaringBitmap> { | ||||||
|     { |  | ||||||
|         let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; |         let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|         buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] |         buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||||
|             .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); |             .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
| @@ -490,8 +550,7 @@ impl Index { | |||||||
|         wtxn: &mut RwTxn, |         wtxn: &mut RwTxn, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         docids: &RoaringBitmap, |         docids: &RoaringBitmap, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |  | ||||||
|         let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; |         let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|         buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] |         buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||||
|             .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); |             .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
| @@ -504,8 +563,7 @@ impl Index { | |||||||
|         &self, |         &self, | ||||||
|         rtxn: &RoTxn, |         rtxn: &RoTxn, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|     ) -> heed::Result<RoaringBitmap> |     ) -> heed::Result<RoaringBitmap> { | ||||||
|     { |  | ||||||
|         let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; |         let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|         buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] |         buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||||
|             .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); |             .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
| @@ -518,7 +576,11 @@ impl Index { | |||||||
|  |  | ||||||
|     /* distinct field */ |     /* distinct field */ | ||||||
|  |  | ||||||
|     pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { |     pub(crate) fn put_distinct_field( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         distinct_field: &str, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) |         self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -532,7 +594,11 @@ impl Index { | |||||||
|  |  | ||||||
|     /* criteria */ |     /* criteria */ | ||||||
|  |  | ||||||
|     pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { |     pub(crate) fn put_criteria( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         criteria: &[Criterion], | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) |         self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -550,7 +616,11 @@ impl Index { | |||||||
|     /* words fst */ |     /* words fst */ | ||||||
|  |  | ||||||
|     /// Writes the FST which is the words dictionary of the engine. |     /// Writes the FST which is the words dictionary of the engine. | ||||||
|     pub(crate) fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { |     pub(crate) fn put_words_fst<A: AsRef<[u8]>>( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fst: &fst::Set<A>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) |         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -564,7 +634,11 @@ impl Index { | |||||||
|  |  | ||||||
|     /* stop words */ |     /* stop words */ | ||||||
|  |  | ||||||
|     pub(crate) fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { |     pub(crate) fn put_stop_words<A: AsRef<[u8]>>( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fst: &fst::Set<A>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) |         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -585,8 +659,7 @@ impl Index { | |||||||
|         &self, |         &self, | ||||||
|         wtxn: &mut RwTxn, |         wtxn: &mut RwTxn, | ||||||
|         synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>, |         synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |  | ||||||
|         self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) |         self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -595,15 +668,17 @@ impl Index { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> { |     pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> { | ||||||
|         Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) |         Ok(self | ||||||
|  |             .main | ||||||
|  |             .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)? | ||||||
|  |             .unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn words_synonyms<S: AsRef<str>>( |     pub fn words_synonyms<S: AsRef<str>>( | ||||||
|         &self, |         &self, | ||||||
|         rtxn: &RoTxn, |         rtxn: &RoTxn, | ||||||
|         words: &[S], |         words: &[S], | ||||||
|     ) -> heed::Result<Option<Vec<Vec<String>>>> |     ) -> heed::Result<Option<Vec<Vec<String>>>> { | ||||||
|     { |  | ||||||
|         let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); |         let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); | ||||||
|         Ok(self.synonyms(rtxn)?.remove(&words)) |         Ok(self.synonyms(rtxn)?.remove(&words)) | ||||||
|     } |     } | ||||||
| @@ -611,8 +686,16 @@ impl Index { | |||||||
|     /* words prefixes fst */ |     /* words prefixes fst */ | ||||||
|  |  | ||||||
|     /// Writes the FST which is the words prefixes dictionary of the engine. |     /// Writes the FST which is the words prefixes dictionary of the engine. | ||||||
|     pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { |     pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>( | ||||||
|         self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fst: &fst::Set<A>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|  |         self.main.put::<_, Str, ByteSlice>( | ||||||
|  |             wtxn, | ||||||
|  |             main_key::WORDS_PREFIXES_FST_KEY, | ||||||
|  |             fst.as_fst().as_bytes(), | ||||||
|  |         ) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the FST which is the words prefixes dictionary of the engine. |     /// Returns the FST which is the words prefixes dictionary of the engine. | ||||||
| @@ -637,13 +720,14 @@ impl Index { | |||||||
|     pub fn documents<'t>( |     pub fn documents<'t>( | ||||||
|         &self, |         &self, | ||||||
|         rtxn: &'t RoTxn, |         rtxn: &'t RoTxn, | ||||||
|         ids: impl IntoIterator<Item=DocumentId>, |         ids: impl IntoIterator<Item = DocumentId>, | ||||||
|     ) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> |     ) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> { | ||||||
|     { |  | ||||||
|         let mut documents = Vec::new(); |         let mut documents = Vec::new(); | ||||||
|  |  | ||||||
|         for id in ids { |         for id in ids { | ||||||
|             let kv = self.documents.get(rtxn, &BEU32::new(id))? |             let kv = self | ||||||
|  |                 .documents | ||||||
|  |                 .get(rtxn, &BEU32::new(id))? | ||||||
|                 .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; |                 .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; | ||||||
|             documents.push((id, kv)); |             documents.push((id, kv)); | ||||||
|         } |         } | ||||||
| @@ -673,7 +757,8 @@ impl Index { | |||||||
|  |  | ||||||
|     /// Returns the index creation time. |     /// Returns the index creation time. | ||||||
|     pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { |     pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { | ||||||
|         Ok(self.main |         Ok(self | ||||||
|  |             .main | ||||||
|             .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)? |             .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)? | ||||||
|             .ok_or(InternalError::DatabaseMissingEntry { |             .ok_or(InternalError::DatabaseMissingEntry { | ||||||
|                 db_name: db_name::MAIN, |                 db_name: db_name::MAIN, | ||||||
| @@ -683,7 +768,8 @@ impl Index { | |||||||
|  |  | ||||||
|     /// Returns the index last updated time. |     /// Returns the index last updated time. | ||||||
|     pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { |     pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { | ||||||
|         Ok(self.main |         Ok(self | ||||||
|  |             .main | ||||||
|             .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)? |             .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)? | ||||||
|             .ok_or(InternalError::DatabaseMissingEntry { |             .ok_or(InternalError::DatabaseMissingEntry { | ||||||
|                 db_name: db_name::MAIN, |                 db_name: db_name::MAIN, | ||||||
| @@ -691,7 +777,11 @@ impl Index { | |||||||
|             })?) |             })?) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> { |     pub(crate) fn set_updated_at( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         time: &DateTime<Utc>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|         self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time) |         self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -704,8 +794,8 @@ pub(crate) mod tests { | |||||||
|     use maplit::hashmap; |     use maplit::hashmap; | ||||||
|     use tempfile::TempDir; |     use tempfile::TempDir; | ||||||
|  |  | ||||||
|     use crate::Index; |  | ||||||
|     use crate::update::{IndexDocuments, UpdateFormat}; |     use crate::update::{IndexDocuments, UpdateFormat}; | ||||||
|  |     use crate::Index; | ||||||
|  |  | ||||||
|     pub(crate) struct TempIndex { |     pub(crate) struct TempIndex { | ||||||
|         inner: Index, |         inner: Index, | ||||||
| @@ -728,10 +818,7 @@ pub(crate) mod tests { | |||||||
|             options.map_size(100 * 4096); |             options.map_size(100 * 4096); | ||||||
|             let _tempdir = TempDir::new_in(".").unwrap(); |             let _tempdir = TempDir::new_in(".").unwrap(); | ||||||
|             let inner = Index::new(options, _tempdir.path()).unwrap(); |             let inner = Index::new(options, _tempdir.path()).unwrap(); | ||||||
|             Self { |             Self { inner, _tempdir } | ||||||
|                 inner, |  | ||||||
|                 _tempdir |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -756,10 +843,13 @@ pub(crate) mod tests { | |||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|         let fields_distribution = index.fields_distribution(&rtxn).unwrap(); |         let fields_distribution = index.fields_distribution(&rtxn).unwrap(); | ||||||
|         assert_eq!(fields_distribution, hashmap! { |         assert_eq!( | ||||||
|             "id".to_string() => 2, |             fields_distribution, | ||||||
|             "name".to_string() => 2, |             hashmap! { | ||||||
|             "age".to_string() => 1, |                 "id".to_string() => 2, | ||||||
|         }); |                 "name".to_string() => 2, | ||||||
|  |                 "age".to_string() => 1, | ||||||
|  |             } | ||||||
|  |         ); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,14 +1,15 @@ | |||||||
| #[macro_use] extern crate pest_derive; | #[macro_use] | ||||||
|  | extern crate pest_derive; | ||||||
|  |  | ||||||
| mod criterion; | mod criterion; | ||||||
| mod error; | mod error; | ||||||
| mod external_documents_ids; | mod external_documents_ids; | ||||||
| mod fields_ids_map; |  | ||||||
| mod search; |  | ||||||
| pub mod facet; | pub mod facet; | ||||||
|  | mod fields_ids_map; | ||||||
| pub mod heed_codec; | pub mod heed_codec; | ||||||
| pub mod index; | pub mod index; | ||||||
| pub mod proximity; | pub mod proximity; | ||||||
|  | mod search; | ||||||
| pub mod tree_level; | pub mod tree_level; | ||||||
| pub mod update; | pub mod update; | ||||||
|  |  | ||||||
| @@ -20,15 +21,17 @@ use std::result::Result as StdResult; | |||||||
| use fxhash::{FxHasher32, FxHasher64}; | use fxhash::{FxHasher32, FxHasher64}; | ||||||
| use serde_json::{Map, Value}; | use serde_json::{Map, Value}; | ||||||
|  |  | ||||||
| pub use self::criterion::{Criterion, default_criteria}; | pub use self::criterion::{default_criteria, Criterion}; | ||||||
| pub use self::error::Error; | pub use self::error::Error; | ||||||
| pub use self::external_documents_ids::ExternalDocumentsIds; | pub use self::external_documents_ids::ExternalDocumentsIds; | ||||||
| pub use self::fields_ids_map::FieldsIdsMap; | pub use self::fields_ids_map::FieldsIdsMap; | ||||||
| pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; | pub use self::heed_codec::{ | ||||||
| pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; |     BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, | ||||||
| pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; |     CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, | ||||||
|  |     RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, | ||||||
|  | }; | ||||||
| pub use self::index::Index; | pub use self::index::Index; | ||||||
| pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; | pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; | ||||||
| pub use self::tree_level::TreeLevel; | pub use self::tree_level::TreeLevel; | ||||||
|  |  | ||||||
| pub type Result<T> = std::result::Result<T, error::Error>; | pub type Result<T> = std::result::Result<T, error::Error>; | ||||||
| @@ -54,9 +57,9 @@ pub fn obkv_to_json( | |||||||
|     displayed_fields: &[FieldId], |     displayed_fields: &[FieldId], | ||||||
|     fields_ids_map: &FieldsIdsMap, |     fields_ids_map: &FieldsIdsMap, | ||||||
|     obkv: obkv::KvReader, |     obkv: obkv::KvReader, | ||||||
| ) -> Result<Map<String, Value>> | ) -> Result<Map<String, Value>> { | ||||||
| { |     displayed_fields | ||||||
|     displayed_fields.iter() |         .iter() | ||||||
|         .copied() |         .copied() | ||||||
|         .flat_map(|id| obkv.get(id).map(|value| (id, value))) |         .flat_map(|id| obkv.get(id).map(|value| (id, value))) | ||||||
|         .map(|(id, value)| { |         .map(|(id, value)| { | ||||||
| @@ -72,7 +75,6 @@ pub fn obkv_to_json( | |||||||
|  |  | ||||||
| /// Transform a JSON value into a string that can be indexed. | /// Transform a JSON value into a string that can be indexed. | ||||||
| pub fn json_to_string(value: &Value) -> Option<String> { | pub fn json_to_string(value: &Value) -> Option<String> { | ||||||
|  |  | ||||||
|     fn inner(value: &Value, output: &mut String) -> bool { |     fn inner(value: &Value, output: &mut String) -> bool { | ||||||
|         use std::fmt::Write; |         use std::fmt::Write; | ||||||
|         match value { |         match value { | ||||||
| @@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option<String> { | |||||||
|                 } |                 } | ||||||
|                 // check that at least one value was written |                 // check that at least one value was written | ||||||
|                 count != 0 |                 count != 0 | ||||||
|             }, |             } | ||||||
|             Value::Object(object) => { |             Value::Object(object) => { | ||||||
|                 let mut buffer = String::new(); |                 let mut buffer = String::new(); | ||||||
|                 let mut count = 0; |                 let mut count = 0; | ||||||
| @@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option<String> { | |||||||
|                 } |                 } | ||||||
|                 // check that at least one value was written |                 // check that at least one value was written | ||||||
|                 count != 0 |                 count != 0 | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option<String> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |  | ||||||
|     use serde_json::json; |     use serde_json::json; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn json_to_string_object() { |     fn json_to_string_object() { | ||||||
|         let value = json!({ |         let value = json!({ | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| use std::cmp; | use std::cmp; | ||||||
|  |  | ||||||
| use crate::{Attribute, Position}; | use crate::{Attribute, Position}; | ||||||
|  |  | ||||||
| const ONE_ATTRIBUTE: u32 = 1000; | const ONE_ATTRIBUTE: u32 = 1000; | ||||||
| @@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { | |||||||
| pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { | pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { | ||||||
|     let (lhs_attr, lhs_index) = extract_position(lhs); |     let (lhs_attr, lhs_index) = extract_position(lhs); | ||||||
|     let (rhs_attr, rhs_index) = extract_position(rhs); |     let (rhs_attr, rhs_index) = extract_position(rhs); | ||||||
|     if lhs_attr != rhs_attr { MAX_DISTANCE } |     if lhs_attr != rhs_attr { | ||||||
|     else { index_proximity(lhs_index, rhs_index) } |         MAX_DISTANCE | ||||||
|  |     } else { | ||||||
|  |         index_proximity(lhs_index, rhs_index) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn extract_position(position: Position) -> (Attribute, Position) { | pub fn extract_position(position: Position) -> (Attribute, Position) { | ||||||
|   | |||||||
| @@ -5,12 +5,12 @@ use log::debug; | |||||||
| use ordered_float::OrderedFloat; | use ordered_float::OrderedFloat; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{Criterion, CriterionParameters, CriterionResult}; | ||||||
| use crate::error::FieldIdMapMissingEntry; | use crate::error::FieldIdMapMissingEntry; | ||||||
| use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; | use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; | ||||||
| use crate::search::facet::FacetIter; | use crate::search::facet::FacetIter; | ||||||
| use crate::search::query_tree::Operation; | use crate::search::query_tree::Operation; | ||||||
| use crate::{FieldId, Index, Result}; | use crate::{FieldId, Index, Result}; | ||||||
| use super::{Criterion, CriterionParameters, CriterionResult}; |  | ||||||
|  |  | ||||||
| /// Threshold on the number of candidates that will make | /// Threshold on the number of candidates that will make | ||||||
| /// the system to choose between one algorithm or another. | /// the system to choose between one algorithm or another. | ||||||
| @@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> { | |||||||
|         ascending: bool, |         ascending: bool, | ||||||
|     ) -> Result<Self> { |     ) -> Result<Self> { | ||||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; |         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|         let field_id = fields_ids_map |         let field_id = | ||||||
|             .id(&field_name) |             fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { | ||||||
|             .ok_or_else(|| FieldIdMapMissingEntry::FieldName { |  | ||||||
|                 field_name: field_name.clone(), |                 field_name: field_name.clone(), | ||||||
|                 process: "AscDesc::new", |                 process: "AscDesc::new", | ||||||
|             })?; |             })?; | ||||||
| @@ -101,44 +100,47 @@ impl<'t> Criterion for AscDesc<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(take(&mut self.bucket_candidates)), |                         bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree, | ||||||
|                             self.query_tree = query_tree; |                         candidates, | ||||||
|                             let mut candidates = match (&self.query_tree, candidates) { |                         filtered_candidates, | ||||||
|                                 (_, Some(candidates)) => candidates, |                         bucket_candidates, | ||||||
|                                 (Some(qt), None) => { |                     }) => { | ||||||
|                                     let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; |                         self.query_tree = query_tree; | ||||||
|                                     resolve_query_tree(&context, qt, params.wdcache)? |                         let mut candidates = match (&self.query_tree, candidates) { | ||||||
|                                 }, |                             (_, Some(candidates)) => candidates, | ||||||
|                                 (None, None) => self.index.documents_ids(self.rtxn)?, |                             (Some(qt), None) => { | ||||||
|                             }; |                                 let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; | ||||||
|  |                                 resolve_query_tree(&context, qt, params.wdcache)? | ||||||
|                             if let Some(filtered_candidates) = filtered_candidates { |  | ||||||
|                                 candidates &= filtered_candidates; |  | ||||||
|                             } |                             } | ||||||
|  |                             (None, None) => self.index.documents_ids(self.rtxn)?, | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                             match bucket_candidates { |                         if let Some(filtered_candidates) = filtered_candidates { | ||||||
|                                 Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, |                             candidates &= filtered_candidates; | ||||||
|                                 None => self.bucket_candidates |= &candidates, |                         } | ||||||
|                             } |  | ||||||
|  |  | ||||||
|                             if candidates.is_empty() { |                         match bucket_candidates { | ||||||
|                                 continue; |                             Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, | ||||||
|                             } |                             None => self.bucket_candidates |= &candidates, | ||||||
|  |                         } | ||||||
|  |  | ||||||
|                             self.allowed_candidates = &candidates - params.excluded_candidates; |                         if candidates.is_empty() { | ||||||
|                             self.candidates = facet_ordered( |                             continue; | ||||||
|                                 self.index, |                         } | ||||||
|                                 self.rtxn, |  | ||||||
|                                 self.field_id, |                         self.allowed_candidates = &candidates - params.excluded_candidates; | ||||||
|                                 self.ascending, |                         self.candidates = facet_ordered( | ||||||
|                                 candidates & &self.faceted_candidates, |                             self.index, | ||||||
|                             )?; |                             self.rtxn, | ||||||
|                         }, |                             self.field_id, | ||||||
|                         None => return Ok(None), |                             self.ascending, | ||||||
|  |                             candidates & &self.faceted_candidates, | ||||||
|  |                         )?; | ||||||
|                     } |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|                 Some(mut candidates) => { |                 Some(mut candidates) => { | ||||||
|                     candidates -= params.excluded_candidates; |                     candidates -= params.excluded_candidates; | ||||||
| @@ -170,11 +172,8 @@ fn facet_ordered<'t>( | |||||||
|         let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; |         let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; | ||||||
|         Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) |         Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) | ||||||
|     } else { |     } else { | ||||||
|         let facet_fn = if ascending { |         let facet_fn = | ||||||
|             FacetIter::new_reducing |             if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; | ||||||
|         } else { |  | ||||||
|             FacetIter::new_reverse_reducing |  | ||||||
|         }; |  | ||||||
|         let iter = facet_fn(rtxn, index, field_id, candidates)?; |         let iter = facet_fn(rtxn, index, field_id, candidates)?; | ||||||
|         Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) |         Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) | ||||||
|     } |     } | ||||||
| @@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>( | |||||||
|     for docid in candidates.iter() { |     for docid in candidates.iter() { | ||||||
|         let left = (field_id, docid, f64::MIN); |         let left = (field_id, docid, f64::MIN); | ||||||
|         let right = (field_id, docid, f64::MAX); |         let right = (field_id, docid, f64::MAX); | ||||||
|         let mut iter = index |         let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; | ||||||
|             .field_id_docid_facet_f64s |  | ||||||
|             .range(rtxn, &(left..=right))?; |  | ||||||
|         let entry = if ascending { iter.next() } else { iter.last() }; |         let entry = if ascending { iter.next() } else { iter.last() }; | ||||||
|         if let Some(((_, _, value), ())) = entry.transpose()? { |         if let Some(((_, _, value), ())) = entry.transpose()? { | ||||||
|             docids_values.push((docid, OrderedFloat(value))); |             docids_values.push((docid, OrderedFloat(value))); | ||||||
|   | |||||||
| @@ -1,15 +1,16 @@ | |||||||
| use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; | use std::borrow::Cow; | ||||||
| use std::collections::{BTreeMap, HashMap, btree_map}; | use std::cmp::{self, Ordering}; | ||||||
| use std::collections::binary_heap::PeekMut; | use std::collections::binary_heap::PeekMut; | ||||||
|  | use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; | ||||||
| use std::mem::take; | use std::mem::take; | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::{TreeLevel, Result, search::build_dfa}; | use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; | ||||||
| use crate::search::criteria::Query; | use crate::search::criteria::Query; | ||||||
| use crate::search::query_tree::{Operation, QueryKind}; | use crate::search::query_tree::{Operation, QueryKind}; | ||||||
| use crate::search::{word_derivations, WordDerivationsCache}; | use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; | ||||||
| use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree}; | use crate::{Result, TreeLevel}; | ||||||
|  |  | ||||||
| /// To be able to divide integers by the number of words in the query | /// To be able to divide integers by the number of words in the query | ||||||
| /// we want to find a multiplier that allow us to divide by any number between 1 and 10. | /// we want to find a multiplier that allow us to divide by any number between 1 and 10. | ||||||
| @@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(take(&mut self.bucket_candidates)), |                         bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { |                 Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { | ||||||
|                     let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { |                     let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { | ||||||
|                         let current_buckets = match self.current_buckets.as_mut() { |                         let current_buckets = match self.current_buckets.as_mut() { | ||||||
|                             Some(current_buckets) => current_buckets, |                             Some(current_buckets) => current_buckets, | ||||||
|                             None => { |                             None => { | ||||||
|                                 let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; |                                 let new_buckets = linear_compute_candidates( | ||||||
|  |                                     self.ctx, | ||||||
|  |                                     &flattened_query_tree, | ||||||
|  |                                     &allowed_candidates, | ||||||
|  |                                 )?; | ||||||
|                                 self.current_buckets.get_or_insert(new_buckets.into_iter()) |                                 self.current_buckets.get_or_insert(new_buckets.into_iter()) | ||||||
|                             }, |                             } | ||||||
|                         }; |                         }; | ||||||
|  |  | ||||||
|                         match current_buckets.next() { |                         match current_buckets.next() { | ||||||
| @@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
|                                     filtered_candidates: None, |                                     filtered_candidates: None, | ||||||
|                                     bucket_candidates: Some(take(&mut self.bucket_candidates)), |                                     bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                                 })); |                                 })); | ||||||
|                             }, |                             } | ||||||
|                         } |                         } | ||||||
|                     } else { |                     } else { | ||||||
|                         match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { |                         match set_compute_candidates( | ||||||
|  |                             self.ctx, | ||||||
|  |                             &flattened_query_tree, | ||||||
|  |                             &allowed_candidates, | ||||||
|  |                             params.wdcache, | ||||||
|  |                         )? { | ||||||
|                             Some(candidates) => candidates, |                             Some(candidates) => candidates, | ||||||
|                             None => { |                             None => { | ||||||
|                                 return Ok(Some(CriterionResult { |                                 return Ok(Some(CriterionResult { | ||||||
| @@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
|                                     filtered_candidates: None, |                                     filtered_candidates: None, | ||||||
|                                     bucket_candidates: Some(take(&mut self.bucket_candidates)), |                                     bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                                 })); |                                 })); | ||||||
|                             }, |                             } | ||||||
|                         } |                         } | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|                     allowed_candidates -= &found_candidates; |                     allowed_candidates -= &found_candidates; | ||||||
|  |  | ||||||
|                     self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); |                     self.state = | ||||||
|  |                         Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); | ||||||
|  |  | ||||||
|                     return Ok(Some(CriterionResult { |                     return Ok(Some(CriterionResult { | ||||||
|                         query_tree: Some(query_tree), |                         query_tree: Some(query_tree), | ||||||
| @@ -109,39 +120,50 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(take(&mut self.bucket_candidates)), |                         bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree: Some(query_tree), | ||||||
|                             let mut candidates = match candidates { |                         candidates, | ||||||
|                                 Some(candidates) => candidates, |                         filtered_candidates, | ||||||
|                                 None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, |                         bucket_candidates, | ||||||
|                             }; |                     }) => { | ||||||
|  |                         let mut candidates = match candidates { | ||||||
|                             if let Some(filtered_candidates) = filtered_candidates { |                             Some(candidates) => candidates, | ||||||
|                                 candidates &= filtered_candidates; |                             None => { | ||||||
|  |                                 resolve_query_tree(self.ctx, &query_tree, params.wdcache)? | ||||||
|  |                                     - params.excluded_candidates | ||||||
|                             } |                             } | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                             let flattened_query_tree = flatten_query_tree(&query_tree); |                         if let Some(filtered_candidates) = filtered_candidates { | ||||||
|  |                             candidates &= filtered_candidates; | ||||||
|  |                         } | ||||||
|  |  | ||||||
|                             match bucket_candidates { |                         let flattened_query_tree = flatten_query_tree(&query_tree); | ||||||
|                                 Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, |  | ||||||
|                                 None => self.bucket_candidates |= &candidates, |  | ||||||
|                             } |  | ||||||
|  |  | ||||||
|                             self.state = Some((query_tree, flattened_query_tree, candidates)); |                         match bucket_candidates { | ||||||
|                             self.current_buckets = None; |                             Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, | ||||||
|                         }, |                             None => self.bucket_candidates |= &candidates, | ||||||
|                         Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { |                         } | ||||||
|                             return Ok(Some(CriterionResult { |  | ||||||
|                                 query_tree: None, |                         self.state = Some((query_tree, flattened_query_tree, candidates)); | ||||||
|                                 candidates, |                         self.current_buckets = None; | ||||||
|                                 filtered_candidates, |  | ||||||
|                                 bucket_candidates, |  | ||||||
|                             })); |  | ||||||
|                         }, |  | ||||||
|                         None => return Ok(None), |  | ||||||
|                     } |                     } | ||||||
|  |                     Some(CriterionResult { | ||||||
|  |                         query_tree: None, | ||||||
|  |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         return Ok(Some(CriterionResult { | ||||||
|  |                             query_tree: None, | ||||||
|  |                             candidates, | ||||||
|  |                             filtered_candidates, | ||||||
|  |                             bucket_candidates, | ||||||
|  |                         })); | ||||||
|  |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
| /// it will begin at the first non-empty interval and will return every interval without | /// it will begin at the first non-empty interval and will return every interval without | ||||||
| /// jumping over empty intervals. | /// jumping over empty intervals. | ||||||
| struct WordLevelIterator<'t, 'q> { | struct WordLevelIterator<'t, 'q> { | ||||||
|     inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>, |     inner: Box< | ||||||
|  |         dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't, | ||||||
|  |     >, | ||||||
|     level: TreeLevel, |     level: TreeLevel, | ||||||
|     interval_size: u32, |     interval_size: u32, | ||||||
|     word: Cow<'q, str>, |     word: Cow<'q, str>, | ||||||
| @@ -162,49 +186,80 @@ struct WordLevelIterator<'t, 'q> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t, 'q> WordLevelIterator<'t, 'q> { | impl<'t, 'q> WordLevelIterator<'t, 'q> { | ||||||
|     fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> { |     fn new( | ||||||
|  |         ctx: &'t dyn Context<'t>, | ||||||
|  |         word: Cow<'q, str>, | ||||||
|  |         in_prefix_cache: bool, | ||||||
|  |     ) -> heed::Result<Option<Self>> { | ||||||
|         match ctx.word_position_last_level(&word, in_prefix_cache)? { |         match ctx.word_position_last_level(&word, in_prefix_cache)? { | ||||||
|             Some(level) =>  { |             Some(level) => { | ||||||
|                 let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); |                 let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); | ||||||
|                 let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; |                 let inner = | ||||||
|                 Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) |                     ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; | ||||||
|             }, |                 Ok(Some(Self { | ||||||
|  |                     inner, | ||||||
|  |                     level, | ||||||
|  |                     interval_size, | ||||||
|  |                     word, | ||||||
|  |                     in_prefix_cache, | ||||||
|  |                     inner_next: None, | ||||||
|  |                     current_interval: None, | ||||||
|  |                 })) | ||||||
|  |             } | ||||||
|             None => Ok(None), |             None => Ok(None), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> { |     fn dig( | ||||||
|  |         &self, | ||||||
|  |         ctx: &'t dyn Context<'t>, | ||||||
|  |         level: &TreeLevel, | ||||||
|  |         left_interval: Option<u32>, | ||||||
|  |     ) -> heed::Result<Self> { | ||||||
|         let level = *level.min(&self.level); |         let level = *level.min(&self.level); | ||||||
|         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); |         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); | ||||||
|         let word = self.word.clone(); |         let word = self.word.clone(); | ||||||
|         let in_prefix_cache = self.in_prefix_cache; |         let in_prefix_cache = self.in_prefix_cache; | ||||||
|         let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; |         let inner = | ||||||
|  |             ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; | ||||||
|  |  | ||||||
|         Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) |         Ok(Self { | ||||||
|  |             inner, | ||||||
|  |             level, | ||||||
|  |             interval_size, | ||||||
|  |             word, | ||||||
|  |             in_prefix_cache, | ||||||
|  |             inner_next: None, | ||||||
|  |             current_interval: None, | ||||||
|  |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { |     fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||||
|         fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } |         fn is_next_interval(last_right: u32, next_left: u32) -> bool { | ||||||
|  |             last_right + 1 == next_left | ||||||
|  |         } | ||||||
|  |  | ||||||
|         let inner_next = match self.inner_next.take() { |         let inner_next = match self.inner_next.take() { | ||||||
|             Some(inner_next) => Some(inner_next), |             Some(inner_next) => Some(inner_next), | ||||||
|             None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), |             None => self | ||||||
|  |                 .inner | ||||||
|  |                 .next() | ||||||
|  |                 .transpose()? | ||||||
|  |                 .map(|((_, _, left, right), docids)| (left, right, docids)), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         match inner_next { |         match inner_next { | ||||||
|             Some((left, right, docids)) => { |             Some((left, right, docids)) => match self.current_interval { | ||||||
|                 match self.current_interval { |                 Some((last_left, last_right)) if !is_next_interval(last_right, left) => { | ||||||
|                     Some((last_left, last_right)) if !is_next_interval(last_right, left) => { |                     let blank_left = last_left + self.interval_size; | ||||||
|                         let blank_left = last_left + self.interval_size; |                     let blank_right = last_right + self.interval_size; | ||||||
|                         let blank_right = last_right + self.interval_size; |                     self.current_interval = Some((blank_left, blank_right)); | ||||||
|                         self.current_interval = Some((blank_left, blank_right)); |                     self.inner_next = Some((left, right, docids)); | ||||||
|                         self.inner_next = Some((left, right, docids)); |                     Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) | ||||||
|                         Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) |                 } | ||||||
|                     }, |                 _ => { | ||||||
|                     _ => { |                     self.current_interval = Some((left, right)); | ||||||
|                         self.current_interval = Some((left, right)); |                     Ok(Some((left, right, docids))) | ||||||
|                         Ok(Some((left, right, docids))) |  | ||||||
|                     } |  | ||||||
|                 } |                 } | ||||||
|             }, |             }, | ||||||
|             None => Ok(None), |             None => Ok(None), | ||||||
| @@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|         ctx: &'t dyn Context<'t>, |         ctx: &'t dyn Context<'t>, | ||||||
|         queries: &'q [Query], |         queries: &'q [Query], | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<Option<Self>> |     ) -> Result<Option<Self>> { | ||||||
|     { |  | ||||||
|         let mut inner = Vec::with_capacity(queries.len()); |         let mut inner = Vec::with_capacity(queries.len()); | ||||||
|         for query in queries { |         for query in queries { | ||||||
|             match &query.kind { |             match &query.kind { | ||||||
|                 QueryKind::Exact { word, .. } => { |                 QueryKind::Exact { word, .. } => { | ||||||
|                     if !query.prefix || ctx.in_prefix_cache(&word) { |                     if !query.prefix || ctx.in_prefix_cache(&word) { | ||||||
|                         let word = Cow::Borrowed(query.kind.word()); |                         let word = Cow::Borrowed(query.kind.word()); | ||||||
|                         if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { |                         if let Some(word_level_iterator) = | ||||||
|  |                             WordLevelIterator::new(ctx, word, query.prefix)? | ||||||
|  |                         { | ||||||
|                             inner.push(word_level_iterator); |                             inner.push(word_level_iterator); | ||||||
|                         } |                         } | ||||||
|                     } else { |                     } else { | ||||||
|                         for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { |                         for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? | ||||||
|  |                         { | ||||||
|                             let word = Cow::Owned(word.to_owned()); |                             let word = Cow::Owned(word.to_owned()); | ||||||
|                             if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { |                             if let Some(word_level_iterator) = | ||||||
|  |                                 WordLevelIterator::new(ctx, word, false)? | ||||||
|  |                             { | ||||||
|                                 inner.push(word_level_iterator); |                                 inner.push(word_level_iterator); | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 }, |                 } | ||||||
|                 QueryKind::Tolerant { typo, word } => { |                 QueryKind::Tolerant { typo, word } => { | ||||||
|                     for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { |                     for (word, _) in | ||||||
|  |                         word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? | ||||||
|  |                     { | ||||||
|                         let word = Cow::Owned(word.to_owned()); |                         let word = Cow::Owned(word.to_owned()); | ||||||
|                         if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { |                         if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? | ||||||
|  |                         { | ||||||
|                             inner.push(word_level_iterator); |                             inner.push(word_level_iterator); | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
| @@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|             Some(parent) => { |             Some(parent) => { | ||||||
|                 let parent = parent.dig(ctx)?; |                 let parent = parent.dig(ctx)?; | ||||||
|                 (parent.level.min(self.level), Some(Box::new(parent))) |                 (parent.level.min(self.level), Some(Box::new(parent))) | ||||||
|             }, |             } | ||||||
|             None => (self.level.saturating_sub(1), None), |             None => (self.level.saturating_sub(1), None), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); |         let left_interval = self | ||||||
|  |             .accumulator | ||||||
|  |             .get(self.interval_to_skip) | ||||||
|  |             .map(|opt| opt.as_ref().map(|(left, _, _)| *left)) | ||||||
|  |             .flatten(); | ||||||
|         let mut inner = Vec::with_capacity(self.inner.len()); |         let mut inner = Vec::with_capacity(self.inner.len()); | ||||||
|         for word_level_iterator in self.inner.iter() { |         for word_level_iterator in self.inner.iter() { | ||||||
|             inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); |             inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) |         Ok(Self { | ||||||
|  |             parent, | ||||||
|  |             inner, | ||||||
|  |             level, | ||||||
|  |             accumulator: vec![], | ||||||
|  |             parent_accumulator: vec![], | ||||||
|  |             interval_to_skip: 0, | ||||||
|  |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { |     fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||||
| @@ -305,12 +378,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|             let wli_u8_level = Into::<u8>::into(wli.level); |             let wli_u8_level = Into::<u8>::into(wli.level); | ||||||
|             let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); |             let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); | ||||||
|             for _ in 0..accumulated_count { |             for _ in 0..accumulated_count { | ||||||
|                 if let Some((next_left, _, next_docids)) =  wli.next()? { |                 if let Some((next_left, _, next_docids)) = wli.next()? { | ||||||
|                     accumulated = match accumulated.take(){ |                     accumulated = match accumulated.take() { | ||||||
|                         Some((acc_left, acc_right, mut acc_docids)) => { |                         Some((acc_left, acc_right, mut acc_docids)) => { | ||||||
|                             acc_docids |= next_docids; |                             acc_docids |= next_docids; | ||||||
|                             Some((acc_left, acc_right, acc_docids)) |                             Some((acc_left, acc_right, acc_docids)) | ||||||
|                         }, |                         } | ||||||
|                         None => Some((next_left, next_left + interval_size, next_docids)), |                         None => Some((next_left, next_left + interval_size, next_docids)), | ||||||
|                     }; |                     }; | ||||||
|                 } |                 } | ||||||
| @@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|  |  | ||||||
|     /// return the next meta-interval created from inner WordLevelIterators, |     /// return the next meta-interval created from inner WordLevelIterators, | ||||||
|     /// and from eventual chainned QueryLevelIterator. |     /// and from eventual chainned QueryLevelIterator. | ||||||
|     fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { |     fn next( | ||||||
|  |         &mut self, | ||||||
|  |         allowed_candidates: &RoaringBitmap, | ||||||
|  |         tree_level: TreeLevel, | ||||||
|  |     ) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||||
|         let parent_result = match self.parent.as_mut() { |         let parent_result = match self.parent.as_mut() { | ||||||
|             Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), |             Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), | ||||||
|             None => None, |             None => None, | ||||||
| @@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|                     &self.parent_accumulator, |                     &self.parent_accumulator, | ||||||
|                     &self.accumulator, |                     &self.accumulator, | ||||||
|                     self.interval_to_skip, |                     self.interval_to_skip, | ||||||
|                     allowed_candidates |                     allowed_candidates, | ||||||
|                 ); |                 ); | ||||||
|                 self.accumulator.push(inner_next); |                 self.accumulator.push(inner_next); | ||||||
|                 self.parent_accumulator.push(parent_next); |                 self.parent_accumulator.push(parent_next); | ||||||
|                 let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; |                 let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; | ||||||
|  |  | ||||||
|                 for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { |                 for current in self | ||||||
|  |                     .accumulator | ||||||
|  |                     .iter() | ||||||
|  |                     .rev() | ||||||
|  |                     .zip(self.parent_accumulator.iter()) | ||||||
|  |                     .skip(self.interval_to_skip) | ||||||
|  |                 { | ||||||
|                     if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { |                     if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { | ||||||
|                         match merged_interval.as_mut() { |                         match merged_interval.as_mut() { | ||||||
|                             Some((_, _, merged_docids)) => *merged_docids |= a & b, |                             Some((_, _, merged_docids)) => *merged_docids |= a & b, | ||||||
|                             None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), |                             None => { | ||||||
|  |                                 merged_interval = Some((left_a + left_b, right_a + right_b, a & b)) | ||||||
|  |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(merged_interval) |                 Ok(merged_interval) | ||||||
|             }, |             } | ||||||
|             None => { |             None => { | ||||||
|                 let level = self.level; |                 let level = self.level; | ||||||
|                 match self.inner_next(level)? { |                 match self.inner_next(level)? { | ||||||
| @@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { | |||||||
|                         self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; |                         self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; | ||||||
|                         candidates &= allowed_candidates; |                         candidates &= allowed_candidates; | ||||||
|                         Ok(Some((left, right, candidates))) |                         Ok(Some((left, right, candidates))) | ||||||
|  |                     } | ||||||
|                     }, |  | ||||||
|                     None => { |                     None => { | ||||||
|                         self.accumulator = vec![None]; |                         self.accumulator = vec![None]; | ||||||
|                         Ok(None) |                         Ok(None) | ||||||
|                     }, |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -379,16 +463,18 @@ fn interval_to_skip( | |||||||
|     already_skiped: usize, |     already_skiped: usize, | ||||||
|     allowed_candidates: &RoaringBitmap, |     allowed_candidates: &RoaringBitmap, | ||||||
| ) -> usize { | ) -> usize { | ||||||
|     parent_accumulator.iter() |     parent_accumulator | ||||||
|  |         .iter() | ||||||
|         .zip(current_accumulator.iter()) |         .zip(current_accumulator.iter()) | ||||||
|         .skip(already_skiped) |         .skip(already_skiped) | ||||||
|         .take_while(|(parent, current)| { |         .take_while(|(parent, current)| { | ||||||
|             let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); |             let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); | ||||||
|             let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); |             let skip_current = current | ||||||
|  |                 .as_ref() | ||||||
|  |                 .map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); | ||||||
|             skip_parent && skip_current |             skip_parent && skip_current | ||||||
|         }) |         }) | ||||||
|         .count() |         .count() | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, | /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, | ||||||
| @@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> { | |||||||
|                 self.last_result = last_result; |                 self.last_result = last_result; | ||||||
|                 self.tree_level = tree_level; |                 self.tree_level = tree_level; | ||||||
|                 Ok(true) |                 Ok(true) | ||||||
|             }, |             } | ||||||
|             None => Ok(false), |             None => Ok(false), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -429,7 +515,7 @@ impl<'t, 'q> Branch<'t, 'q> { | |||||||
|         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); |         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); | ||||||
|         let (left, right, _) = self.last_result; |         let (left, right, _) = self.last_result; | ||||||
|  |  | ||||||
|         self.last_result = (left + interval_size,  right + interval_size, RoaringBitmap::new()); |         self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// return the score of the current inner interval. |     /// return the score of the current inner interval. | ||||||
| @@ -477,31 +563,31 @@ fn initialize_query_level_iterators<'t, 'q>( | |||||||
|     allowed_candidates: &RoaringBitmap, |     allowed_candidates: &RoaringBitmap, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<BinaryHeap<Branch<'t, 'q>>> { | ) -> Result<BinaryHeap<Branch<'t, 'q>>> { | ||||||
|  |  | ||||||
|     let mut positions = BinaryHeap::with_capacity(branches.len()); |     let mut positions = BinaryHeap::with_capacity(branches.len()); | ||||||
|     for branch in branches { |     for branch in branches { | ||||||
|         let mut branch_positions = Vec::with_capacity(branch.len()); |         let mut branch_positions = Vec::with_capacity(branch.len()); | ||||||
|         for queries in  branch { |         for queries in branch { | ||||||
|             match QueryLevelIterator::new(ctx, queries, wdcache)? { |             match QueryLevelIterator::new(ctx, queries, wdcache)? { | ||||||
|                 Some(qli) => branch_positions.push(qli), |                 Some(qli) => branch_positions.push(qli), | ||||||
|                 None => { |                 None => { | ||||||
|                     // the branch seems to be invalid, so we skip it. |                     // the branch seems to be invalid, so we skip it. | ||||||
|                     branch_positions.clear(); |                     branch_positions.clear(); | ||||||
|                     break; |                     break; | ||||||
|                 }, |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         // QueryLevelIterator need to be sorted by level and folded in descending order. |         // QueryLevelIterator need to be sorted by level and folded in descending order. | ||||||
|         branch_positions.sort_unstable_by_key(|qli| qli.level); |         branch_positions.sort_unstable_by_key(|qli| qli.level); | ||||||
|         let folded_query_level_iterators = branch_positions |         let folded_query_level_iterators = | ||||||
|             .into_iter() |             branch_positions.into_iter().fold(None, |fold: Option<QueryLevelIterator>, mut qli| { | ||||||
|             .fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold { |                 match fold { | ||||||
|                 Some(fold) => { |                     Some(fold) => { | ||||||
|                     qli.parent(fold); |                         qli.parent(fold); | ||||||
|                     Some(qli) |                         Some(qli) | ||||||
|                 }, |                     } | ||||||
|                 None => Some(qli), |                     None => Some(qli), | ||||||
|         }); |                 } | ||||||
|  |             }); | ||||||
|  |  | ||||||
|         if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { |         if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { | ||||||
|             let tree_level = folded_query_level_iterators.level; |             let tree_level = folded_query_level_iterators.level; | ||||||
| @@ -526,9 +612,9 @@ fn set_compute_candidates<'t>( | |||||||
|     branches: &FlattenedQueryTree, |     branches: &FlattenedQueryTree, | ||||||
|     allowed_candidates: &RoaringBitmap, |     allowed_candidates: &RoaringBitmap, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<Option<RoaringBitmap>> | ) -> Result<Option<RoaringBitmap>> { | ||||||
| { |     let mut branches_heap = | ||||||
|     let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; |         initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; | ||||||
|     let lowest_level = TreeLevel::min_value(); |     let lowest_level = TreeLevel::min_value(); | ||||||
|     let mut final_candidates: Option<(u32, RoaringBitmap)> = None; |     let mut final_candidates: Option<(u32, RoaringBitmap)> = None; | ||||||
|     let mut allowed_candidates = allowed_candidates.clone(); |     let mut allowed_candidates = allowed_candidates.clone(); | ||||||
| @@ -539,15 +625,18 @@ fn set_compute_candidates<'t>( | |||||||
|         // if current is worst than best we break to return |         // if current is worst than best we break to return | ||||||
|         // candidates that correspond to the best rank |         // candidates that correspond to the best rank | ||||||
|         if let Some((best_rank, _)) = final_candidates { |         if let Some((best_rank, _)) = final_candidates { | ||||||
|             if branch_rank > best_rank { break } |             if branch_rank > best_rank { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|         let _left = branch.last_result.0; |         let _left = branch.last_result.0; | ||||||
|         let candidates = take(&mut branch.last_result.2); |         let candidates = take(&mut branch.last_result.2); | ||||||
|         if candidates.is_empty() { |         if candidates.is_empty() { | ||||||
|             // we don't have candidates, get next interval. |             // we don't have candidates, get next interval. | ||||||
|             if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } |             if !branch.next(&allowed_candidates)? { | ||||||
|         } |                 PeekMut::pop(branch); | ||||||
|         else if is_lowest_level { |             } | ||||||
|  |         } else if is_lowest_level { | ||||||
|             // we have candidates, but we can't dig deeper. |             // we have candidates, but we can't dig deeper. | ||||||
|             allowed_candidates -= &candidates; |             allowed_candidates -= &candidates; | ||||||
|             final_candidates = match final_candidates.take() { |             final_candidates = match final_candidates.take() { | ||||||
| @@ -556,19 +645,20 @@ fn set_compute_candidates<'t>( | |||||||
|                     best_candidates |= candidates; |                     best_candidates |= candidates; | ||||||
|                     branch.lazy_next(); |                     branch.lazy_next(); | ||||||
|                     Some((best_rank, best_candidates)) |                     Some((best_rank, best_candidates)) | ||||||
|                 }, |                 } | ||||||
|                 // we take current candidates as best candidates |                 // we take current candidates as best candidates | ||||||
|                 None => { |                 None => { | ||||||
|                     branch.lazy_next(); |                     branch.lazy_next(); | ||||||
|                     Some((branch_rank, candidates)) |                     Some((branch_rank, candidates)) | ||||||
|                 }, |                 } | ||||||
|             }; |             }; | ||||||
|         } else { |         } else { | ||||||
|             // we have candidates, lets dig deeper in levels. |             // we have candidates, lets dig deeper in levels. | ||||||
|             branch.dig(ctx)?; |             branch.dig(ctx)?; | ||||||
|             if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } |             if !branch.next(&allowed_candidates)? { | ||||||
|  |                 PeekMut::pop(branch); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(final_candidates.map(|(_rank, candidates)| candidates)) |     Ok(final_candidates.map(|(_rank, candidates)| candidates)) | ||||||
| @@ -578,9 +668,11 @@ fn linear_compute_candidates( | |||||||
|     ctx: &dyn Context, |     ctx: &dyn Context, | ||||||
|     branches: &FlattenedQueryTree, |     branches: &FlattenedQueryTree, | ||||||
|     allowed_candidates: &RoaringBitmap, |     allowed_candidates: &RoaringBitmap, | ||||||
| ) -> Result<BTreeMap<u64, RoaringBitmap>> | ) -> Result<BTreeMap<u64, RoaringBitmap>> { | ||||||
| { |     fn compute_candidate_rank( | ||||||
|     fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 { |         branches: &FlattenedQueryTree, | ||||||
|  |         words_positions: HashMap<String, RoaringBitmap>, | ||||||
|  |     ) -> u64 { | ||||||
|         let mut min_rank = u64::max_value(); |         let mut min_rank = u64::max_value(); | ||||||
|         for branch in branches { |         for branch in branches { | ||||||
|             let branch_len = branch.len(); |             let branch_len = branch.len(); | ||||||
| @@ -593,17 +685,20 @@ fn linear_compute_candidates( | |||||||
|                         QueryKind::Exact { word, .. } => { |                         QueryKind::Exact { word, .. } => { | ||||||
|                             if *prefix { |                             if *prefix { | ||||||
|                                 word_derivations(word, true, 0, &words_positions) |                                 word_derivations(word, true, 0, &words_positions) | ||||||
|                                     .flat_map(|positions| positions.iter().next()).min() |                                     .flat_map(|positions| positions.iter().next()) | ||||||
|  |                                     .min() | ||||||
|                             } else { |                             } else { | ||||||
|                                 words_positions.get(word) |                                 words_positions | ||||||
|  |                                     .get(word) | ||||||
|                                     .map(|positions| positions.iter().next()) |                                     .map(|positions| positions.iter().next()) | ||||||
|                                     .flatten() |                                     .flatten() | ||||||
|                             } |                             } | ||||||
|                         }, |                         } | ||||||
|                         QueryKind::Tolerant { typo, word } => { |                         QueryKind::Tolerant { typo, word } => { | ||||||
|                             word_derivations(word, *prefix, *typo, &words_positions) |                             word_derivations(word, *prefix, *typo, &words_positions) | ||||||
|                                 .flat_map(|positions| positions.iter().next()).min() |                                 .flat_map(|positions| positions.iter().next()) | ||||||
|                         }, |                                 .min() | ||||||
|  |                         } | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|                     match (position, current_position) { |                     match (position, current_position) { | ||||||
| @@ -627,9 +722,11 @@ fn linear_compute_candidates( | |||||||
|                 branch_rank.sort_unstable(); |                 branch_rank.sort_unstable(); | ||||||
|                 // because several words in same query can't match all a the position 0, |                 // because several words in same query can't match all a the position 0, | ||||||
|                 // we substract the word index to the position. |                 // we substract the word index to the position. | ||||||
|                 let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); |                 let branch_rank: u64 = | ||||||
|  |                     branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); | ||||||
|                 // here we do the means of the words of the branch |                 // here we do the means of the words of the branch | ||||||
|                 min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); |                 min_rank = | ||||||
|  |                     min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -641,8 +738,7 @@ fn linear_compute_candidates( | |||||||
|         is_prefix: bool, |         is_prefix: bool, | ||||||
|         max_typo: u8, |         max_typo: u8, | ||||||
|         words_positions: &'a HashMap<String, RoaringBitmap>, |         words_positions: &'a HashMap<String, RoaringBitmap>, | ||||||
|     ) -> impl Iterator<Item = &'a RoaringBitmap> |     ) -> impl Iterator<Item = &'a RoaringBitmap> { | ||||||
|     { |  | ||||||
|         let dfa = build_dfa(word, max_typo, is_prefix); |         let dfa = build_dfa(word, max_typo, is_prefix); | ||||||
|         words_positions.iter().filter_map(move |(document_word, positions)| { |         words_positions.iter().filter_map(move |(document_word, positions)| { | ||||||
|             use levenshtein_automata::Distance; |             use levenshtein_automata::Distance; | ||||||
| @@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { | |||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 out |                 out | ||||||
|             }, |             } | ||||||
|             None => recurse(head), |             None => recurse(head), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn recurse(op: &Operation) -> FlattenedQueryTree { |     fn recurse(op: &Operation) -> FlattenedQueryTree { | ||||||
|         match op { |         match op { | ||||||
|             And(ops) => { |             And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)), | ||||||
|                 ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) |             Or(_, ops) => { | ||||||
|             }, |                 if ops.iter().all(|op| op.query().is_some()) { | ||||||
|             Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { |                     vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] | ||||||
|                 vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] |                 } else { | ||||||
|             } else { |                     ops.iter().map(recurse).flatten().collect() | ||||||
|                 ops.iter().map(recurse).flatten().collect() |                 } | ||||||
|             }, |             } | ||||||
|             Phrase(words) => { |             Phrase(words) => { | ||||||
|                 let queries = words.iter().map(|word| { |                 let queries = words | ||||||
|                     vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] |                     .iter() | ||||||
|                 }).collect(); |                     .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) | ||||||
|  |                     .collect(); | ||||||
|                 vec![queries] |                 vec![queries] | ||||||
|             } |             } | ||||||
|             Operation::Query(query) => vec![vec![vec![query.clone()]]], |             Operation::Query(query) => vec![vec![vec![query.clone()]]], | ||||||
| @@ -712,28 +809,43 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { | |||||||
| mod tests { | mod tests { | ||||||
|     use big_s::S; |     use big_s::S; | ||||||
|  |  | ||||||
|     use crate::search::criteria::QueryKind; |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |     use crate::search::criteria::QueryKind; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple_flatten_query_tree() { |     fn simple_flatten_query_tree() { | ||||||
|         let query_tree = Operation::Or(false, vec![ |         let query_tree = Operation::Or( | ||||||
|             Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), |             false, | ||||||
|             Operation::And(vec![ |             vec![ | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), |                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), |                 Operation::And(vec![ | ||||||
|             ]), |                     Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), | ||||||
|             Operation::And(vec![ |                     Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), |  | ||||||
|                 Operation::Or(false, vec![ |  | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), |  | ||||||
|                     Operation::And(vec![ |  | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), |  | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), |  | ||||||
|                     ]), |  | ||||||
|                 ]), |                 ]), | ||||||
|             ]), |                 Operation::And(vec![ | ||||||
|         ]); |                     Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), | ||||||
|  |                     Operation::Or( | ||||||
|  |                         false, | ||||||
|  |                         vec![ | ||||||
|  |                             Operation::Query(Query { | ||||||
|  |                                 prefix: false, | ||||||
|  |                                 kind: QueryKind::exact(S("thefish")), | ||||||
|  |                             }), | ||||||
|  |                             Operation::And(vec![ | ||||||
|  |                                 Operation::Query(Query { | ||||||
|  |                                     prefix: false, | ||||||
|  |                                     kind: QueryKind::exact(S("the")), | ||||||
|  |                                 }), | ||||||
|  |                                 Operation::Query(Query { | ||||||
|  |                                     prefix: false, | ||||||
|  |                                     kind: QueryKind::exact(S("fish")), | ||||||
|  |                                 }), | ||||||
|  |                             ]), | ||||||
|  |                         ], | ||||||
|  |                     ), | ||||||
|  |                 ]), | ||||||
|  |             ], | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         let expected = vec![ |         let expected = vec![ | ||||||
|             vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], |             vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], | ||||||
|   | |||||||
| @@ -2,19 +2,15 @@ use std::convert::TryFrom; | |||||||
| use std::mem::take; | use std::mem::take; | ||||||
| use std::ops::BitOr; | use std::ops::BitOr; | ||||||
|  |  | ||||||
|  | use itertools::Itertools; | ||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use itertools::Itertools; |  | ||||||
|  |  | ||||||
| use crate::search::query_tree::{Operation, PrimitiveQueryPart}; |  | ||||||
| use crate::search::criteria::{ | use crate::search::criteria::{ | ||||||
|     Context, |     resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, | ||||||
|     Criterion, |  | ||||||
|     CriterionParameters, |  | ||||||
|     CriterionResult, |  | ||||||
|     resolve_query_tree, |  | ||||||
| }; | }; | ||||||
| use crate::{TreeLevel, Result}; | use crate::search::query_tree::{Operation, PrimitiveQueryPart}; | ||||||
|  | use crate::{Result, TreeLevel}; | ||||||
|  |  | ||||||
| pub struct Exactness<'t> { | pub struct Exactness<'t> { | ||||||
|     ctx: &'t dyn Context<'t>, |     ctx: &'t dyn Context<'t>, | ||||||
| @@ -26,7 +22,11 @@ pub struct Exactness<'t> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t> Exactness<'t> { | impl<'t> Exactness<'t> { | ||||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>, primitive_query: &[PrimitiveQueryPart]) -> heed::Result<Self> { |     pub fn new( | ||||||
|  |         ctx: &'t dyn Context<'t>, | ||||||
|  |         parent: Box<dyn Criterion + 't>, | ||||||
|  |         primitive_query: &[PrimitiveQueryPart], | ||||||
|  |     ) -> heed::Result<Self> { | ||||||
|         let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); |         let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); | ||||||
|         for part in primitive_query { |         for part in primitive_query { | ||||||
|             query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); |             query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); | ||||||
| @@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> { | |||||||
|                     // reset state |                     // reset state | ||||||
|                     self.state = None; |                     self.state = None; | ||||||
|                     self.query_tree = None; |                     self.query_tree = None; | ||||||
|                 }, |                 } | ||||||
|                 Some(state) => { |                 Some(state) => { | ||||||
|                     let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; |                     let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; | ||||||
|                     self.state = state; |                     self.state = state; | ||||||
| @@ -70,40 +70,51 @@ impl<'t> Criterion for Exactness<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(take(&mut self.bucket_candidates)), |                         bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree: Some(query_tree), | ||||||
|                             let mut candidates = match candidates { |                         candidates, | ||||||
|                                 Some(candidates) => candidates, |                         filtered_candidates, | ||||||
|                                 None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, |                         bucket_candidates, | ||||||
|                             }; |                     }) => { | ||||||
|  |                         let mut candidates = match candidates { | ||||||
|                             if let Some(filtered_candidates) = filtered_candidates { |                             Some(candidates) => candidates, | ||||||
|                                 candidates &= filtered_candidates; |                             None => { | ||||||
|  |                                 resolve_query_tree(self.ctx, &query_tree, params.wdcache)? | ||||||
|  |                                     - params.excluded_candidates | ||||||
|                             } |                             } | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                             match bucket_candidates { |                         if let Some(filtered_candidates) = filtered_candidates { | ||||||
|                                 Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, |                             candidates &= filtered_candidates; | ||||||
|                                 None => self.bucket_candidates |= &candidates, |                         } | ||||||
|                             } |  | ||||||
|  |  | ||||||
|                             self.state = Some(State::new(candidates)); |                         match bucket_candidates { | ||||||
|                             self.query_tree = Some(query_tree); |                             Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, | ||||||
|                         }, |                             None => self.bucket_candidates |= &candidates, | ||||||
|                         Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { |                         } | ||||||
|                             return Ok(Some(CriterionResult { |  | ||||||
|                                 query_tree: None, |                         self.state = Some(State::new(candidates)); | ||||||
|                                 candidates, |                         self.query_tree = Some(query_tree); | ||||||
|                                 filtered_candidates, |  | ||||||
|                                 bucket_candidates, |  | ||||||
|                             })); |  | ||||||
|                         }, |  | ||||||
|                         None => return Ok(None), |  | ||||||
|                     } |                     } | ||||||
|  |                     Some(CriterionResult { | ||||||
|  |                         query_tree: None, | ||||||
|  |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         return Ok(Some(CriterionResult { | ||||||
|  |                             query_tree: None, | ||||||
|  |                             candidates, | ||||||
|  |                             filtered_candidates, | ||||||
|  |                             bucket_candidates, | ||||||
|  |                         })); | ||||||
|  |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|             } |             } | ||||||
|          } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -125,9 +136,9 @@ impl State { | |||||||
|  |  | ||||||
|     fn difference_with(&mut self, lhs: &RoaringBitmap) { |     fn difference_with(&mut self, lhs: &RoaringBitmap) { | ||||||
|         match self { |         match self { | ||||||
|             Self::ExactAttribute(candidates) | |             Self::ExactAttribute(candidates) | ||||||
|             Self::AttributeStartsWith(candidates) | |             | Self::AttributeStartsWith(candidates) | ||||||
|             Self::ExactWords(candidates) => *candidates -= lhs, |             | Self::ExactWords(candidates) => *candidates -= lhs, | ||||||
|             Self::Remainings(candidates_array) => { |             Self::Remainings(candidates_array) => { | ||||||
|                 candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); |                 candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); | ||||||
|                 candidates_array.retain(|candidates| !candidates.is_empty()); |                 candidates_array.retain(|candidates| !candidates.is_empty()); | ||||||
| @@ -137,9 +148,9 @@ impl State { | |||||||
|  |  | ||||||
|     fn is_empty(&self) -> bool { |     fn is_empty(&self) -> bool { | ||||||
|         match self { |         match self { | ||||||
|             Self::ExactAttribute(candidates) | |             Self::ExactAttribute(candidates) | ||||||
|             Self::AttributeStartsWith(candidates) | |             | Self::AttributeStartsWith(candidates) | ||||||
|             Self::ExactWords(candidates) => candidates.is_empty(), |             | Self::ExactWords(candidates) => candidates.is_empty(), | ||||||
|             Self::Remainings(candidates_array) => { |             Self::Remainings(candidates_array) => { | ||||||
|                 candidates_array.iter().all(RoaringBitmap::is_empty) |                 candidates_array.iter().all(RoaringBitmap::is_empty) | ||||||
|             } |             } | ||||||
| @@ -158,8 +169,7 @@ fn resolve_state( | |||||||
|     ctx: &dyn Context, |     ctx: &dyn Context, | ||||||
|     state: State, |     state: State, | ||||||
|     query: &[ExactQueryPart], |     query: &[ExactQueryPart], | ||||||
| ) -> Result<(RoaringBitmap, Option<State>)> | ) -> Result<(RoaringBitmap, Option<State>)> { | ||||||
| { |  | ||||||
|     use State::*; |     use State::*; | ||||||
|     match state { |     match state { | ||||||
|         ExactAttribute(mut allowed_candidates) => { |         ExactAttribute(mut allowed_candidates) => { | ||||||
| @@ -167,8 +177,11 @@ fn resolve_state( | |||||||
|             if let Ok(query_len) = u8::try_from(query.len()) { |             if let Ok(query_len) = u8::try_from(query.len()) { | ||||||
|                 let attributes_ids = ctx.searchable_fields_ids()?; |                 let attributes_ids = ctx.searchable_fields_ids()?; | ||||||
|                 for id in attributes_ids { |                 for id in attributes_ids { | ||||||
|                     if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { |                     if let Some(attribute_allowed_docids) = | ||||||
|                         let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; |                         ctx.field_id_word_count_docids(id, query_len)? | ||||||
|  |                     { | ||||||
|  |                         let mut attribute_candidates_array = | ||||||
|  |                             attribute_start_with_docids(ctx, id as u32, query)?; | ||||||
|                         attribute_candidates_array.push(attribute_allowed_docids); |                         attribute_candidates_array.push(attribute_allowed_docids); | ||||||
|                         candidates |= intersection_of(attribute_candidates_array.iter().collect()); |                         candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||||
|                     } |                     } | ||||||
| @@ -181,12 +194,13 @@ fn resolve_state( | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) |             Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) | ||||||
|         }, |         } | ||||||
|         AttributeStartsWith(mut allowed_candidates) => { |         AttributeStartsWith(mut allowed_candidates) => { | ||||||
|             let mut candidates = RoaringBitmap::new(); |             let mut candidates = RoaringBitmap::new(); | ||||||
|             let attributes_ids = ctx.searchable_fields_ids()?; |             let attributes_ids = ctx.searchable_fields_ids()?; | ||||||
|             for id in attributes_ids { |             for id in attributes_ids { | ||||||
|                 let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; |                 let attribute_candidates_array = | ||||||
|  |                     attribute_start_with_docids(ctx, id as u32, query)?; | ||||||
|                 candidates |= intersection_of(attribute_candidates_array.iter().collect()); |                 candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||||
|             } |             } | ||||||
|  |  | ||||||
| @@ -195,7 +209,7 @@ fn resolve_state( | |||||||
|             // remove current candidates from allowed candidates |             // remove current candidates from allowed candidates | ||||||
|             allowed_candidates -= &candidates; |             allowed_candidates -= &candidates; | ||||||
|             Ok((candidates, Some(ExactWords(allowed_candidates)))) |             Ok((candidates, Some(ExactWords(allowed_candidates)))) | ||||||
|         }, |         } | ||||||
|         ExactWords(mut allowed_candidates) => { |         ExactWords(mut allowed_candidates) => { | ||||||
|             let number_of_part = query.len(); |             let number_of_part = query.len(); | ||||||
|             let mut parts_candidates_array = Vec::with_capacity(number_of_part); |             let mut parts_candidates_array = Vec::with_capacity(number_of_part); | ||||||
| @@ -210,7 +224,7 @@ fn resolve_state( | |||||||
|                                 candidates |= synonym_candidates; |                                 candidates |= synonym_candidates; | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|                     }, |                     } | ||||||
|                     // compute intersection on pair of words with a proximity of 0. |                     // compute intersection on pair of words with a proximity of 0. | ||||||
|                     Phrase(phrase) => { |                     Phrase(phrase) => { | ||||||
|                         let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); |                         let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); | ||||||
| @@ -220,8 +234,8 @@ fn resolve_state( | |||||||
|                                     Some(docids) => bitmaps.push(docids), |                                     Some(docids) => bitmaps.push(docids), | ||||||
|                                     None => { |                                     None => { | ||||||
|                                         bitmaps.clear(); |                                         bitmaps.clear(); | ||||||
|                                         break |                                         break; | ||||||
|                                     }, |                                     } | ||||||
|                                 } |                                 } | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
| @@ -247,7 +261,7 @@ fn resolve_state( | |||||||
|                     // intersect each word candidates in combinations |                     // intersect each word candidates in combinations | ||||||
|                     .map(intersection_of) |                     .map(intersection_of) | ||||||
|                     // union combinations of `c_count` exact words |                     // union combinations of `c_count` exact words | ||||||
|                     .fold(RoaringBitmap::new(),  RoaringBitmap::bitor); |                     .fold(RoaringBitmap::new(), RoaringBitmap::bitor); | ||||||
|                 // only keep allowed candidates |                 // only keep allowed candidates | ||||||
|                 combinations_candidates &= &allowed_candidates; |                 combinations_candidates &= &allowed_candidates; | ||||||
|                 // remove current candidates from allowed candidates |                 // remove current candidates from allowed candidates | ||||||
| @@ -261,7 +275,7 @@ fn resolve_state( | |||||||
|             candidates_array.reverse(); |             candidates_array.reverse(); | ||||||
|  |  | ||||||
|             Ok((all_exact_candidates, Some(Remainings(candidates_array)))) |             Ok((all_exact_candidates, Some(Remainings(candidates_array)))) | ||||||
|         }, |         } | ||||||
|         // pop remainings candidates until the emptiness |         // pop remainings candidates until the emptiness | ||||||
|         Remainings(mut candidates_array) => { |         Remainings(mut candidates_array) => { | ||||||
|             let candidates = candidates_array.pop().unwrap_or_default(); |             let candidates = candidates_array.pop().unwrap_or_default(); | ||||||
| @@ -270,12 +284,15 @@ fn resolve_state( | |||||||
|             } else { |             } else { | ||||||
|                 Ok((candidates, None)) |                 Ok((candidates, None)) | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|  |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result<Vec<RoaringBitmap>> { | fn attribute_start_with_docids( | ||||||
|  |     ctx: &dyn Context, | ||||||
|  |     attribute_id: u32, | ||||||
|  |     query: &[ExactQueryPart], | ||||||
|  | ) -> heed::Result<Vec<RoaringBitmap>> { | ||||||
|     let lowest_level = TreeLevel::min_value(); |     let lowest_level = TreeLevel::min_value(); | ||||||
|     let mut attribute_candidates_array = Vec::new(); |     let mut attribute_candidates_array = Vec::new(); | ||||||
|     // start from attribute first position |     // start from attribute first position | ||||||
| @@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex | |||||||
|                 } |                 } | ||||||
|                 attribute_candidates_array.push(synonyms_candidates); |                 attribute_candidates_array.push(synonyms_candidates); | ||||||
|                 pos += 1; |                 pos += 1; | ||||||
|             }, |             } | ||||||
|             Phrase(phrase) => { |             Phrase(phrase) => { | ||||||
|                 for word in phrase { |                 for word in phrase { | ||||||
|                     let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; |                     let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; | ||||||
| @@ -325,24 +342,30 @@ pub enum ExactQueryPart { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl ExactQueryPart { | impl ExactQueryPart { | ||||||
|     fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result<Self> { |     fn from_primitive_query_part( | ||||||
|  |         ctx: &dyn Context, | ||||||
|  |         part: &PrimitiveQueryPart, | ||||||
|  |     ) -> heed::Result<Self> { | ||||||
|         let part = match part { |         let part = match part { | ||||||
|             PrimitiveQueryPart::Word(word, _) => { |             PrimitiveQueryPart::Word(word, _) => { | ||||||
|                 match ctx.synonyms(word)? { |                 match ctx.synonyms(word)? { | ||||||
|                     Some(synonyms) => { |                     Some(synonyms) => { | ||||||
|                         let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { |                         let mut synonyms: Vec<_> = synonyms | ||||||
|                             // keep 1 word synonyms only. |                             .into_iter() | ||||||
|                             match array.pop() { |                             .filter_map(|mut array| { | ||||||
|                                 Some(word) if array.is_empty() => Some(word), |                                 // keep 1 word synonyms only. | ||||||
|                                 _ => None, |                                 match array.pop() { | ||||||
|                             } |                                     Some(word) if array.is_empty() => Some(word), | ||||||
|                         }).collect(); |                                     _ => None, | ||||||
|  |                                 } | ||||||
|  |                             }) | ||||||
|  |                             .collect(); | ||||||
|                         synonyms.push(word.clone()); |                         synonyms.push(word.clone()); | ||||||
|                         ExactQueryPart::Synonyms(synonyms) |                         ExactQueryPart::Synonyms(synonyms) | ||||||
|                     }, |                     } | ||||||
|                     None => ExactQueryPart::Synonyms(vec![word.clone()]), |                     None => ExactQueryPart::Synonyms(vec![word.clone()]), | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), |             PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,10 +1,10 @@ | |||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::Result; | use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; | ||||||
| use crate::search::query_tree::Operation; | use crate::search::query_tree::Operation; | ||||||
| use crate::search::WordDerivationsCache; | use crate::search::WordDerivationsCache; | ||||||
| use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; | use crate::Result; | ||||||
|  |  | ||||||
| /// The result of a call to the fetcher. | /// The result of a call to the fetcher. | ||||||
| #[derive(Debug, Clone, PartialEq)] | #[derive(Debug, Clone, PartialEq)] | ||||||
| @@ -26,7 +26,12 @@ pub struct Final<'t> { | |||||||
|  |  | ||||||
| impl<'t> Final<'t> { | impl<'t> Final<'t> { | ||||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> { |     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> { | ||||||
|         Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() } |         Final { | ||||||
|  |             ctx, | ||||||
|  |             parent, | ||||||
|  |             wdcache: WordDerivationsCache::new(), | ||||||
|  |             returned_candidates: RoaringBitmap::new(), | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[logging_timer::time("Final::{}")] |     #[logging_timer::time("Final::{}")] | ||||||
| @@ -40,10 +45,17 @@ impl<'t> Final<'t> { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         match self.parent.next(&mut criterion_parameters)? { |         match self.parent.next(&mut criterion_parameters)? { | ||||||
|             Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { |             Some(CriterionResult { | ||||||
|  |                 query_tree, | ||||||
|  |                 candidates, | ||||||
|  |                 filtered_candidates, | ||||||
|  |                 bucket_candidates, | ||||||
|  |             }) => { | ||||||
|                 let mut candidates = match (candidates, query_tree.as_ref()) { |                 let mut candidates = match (candidates, query_tree.as_ref()) { | ||||||
|                     (Some(candidates), _) => candidates, |                     (Some(candidates), _) => candidates, | ||||||
|                     (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, |                     (None, Some(qt)) => { | ||||||
|  |                         resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates | ||||||
|  |                     } | ||||||
|                     (None, None) => self.ctx.documents_ids()? - excluded_candidates, |                     (None, None) => self.ctx.documents_ids()? - excluded_candidates, | ||||||
|                 }; |                 }; | ||||||
|  |  | ||||||
| @@ -56,7 +68,7 @@ impl<'t> Final<'t> { | |||||||
|                 self.returned_candidates |= &candidates; |                 self.returned_candidates |= &candidates; | ||||||
|  |  | ||||||
|                 Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) |                 Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) | ||||||
|             }, |             } | ||||||
|             None => Ok(None), |             None => Ok(None), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,15 +1,18 @@ | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::Result; | use super::{Criterion, CriterionParameters, CriterionResult}; | ||||||
| use crate::search::query_tree::Operation; | use crate::search::query_tree::Operation; | ||||||
| use super::{Criterion, CriterionResult, CriterionParameters}; | use crate::Result; | ||||||
|  |  | ||||||
| pub struct Initial { | pub struct Initial { | ||||||
|     answer: Option<CriterionResult> |     answer: Option<CriterionResult>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Initial { | impl Initial { | ||||||
|     pub fn new(query_tree: Option<Operation>, filtered_candidates: Option<RoaringBitmap>) -> Initial { |     pub fn new( | ||||||
|  |         query_tree: Option<Operation>, | ||||||
|  |         filtered_candidates: Option<RoaringBitmap>, | ||||||
|  |     ) -> Initial { | ||||||
|         let answer = CriterionResult { |         let answer = CriterionResult { | ||||||
|             query_tree, |             query_tree, | ||||||
|             candidates: None, |             candidates: None, | ||||||
|   | |||||||
| @@ -1,29 +1,28 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  | use std::collections::HashMap; | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; |  | ||||||
| use crate::{Index, DocumentId, Result}; |  | ||||||
|  |  | ||||||
| use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; |  | ||||||
| use self::asc_desc::AscDesc; | use self::asc_desc::AscDesc; | ||||||
| use self::attribute::Attribute; | use self::attribute::Attribute; | ||||||
| use self::exactness::Exactness; | use self::exactness::Exactness; | ||||||
| use self::r#final::Final; |  | ||||||
| use self::initial::Initial; | use self::initial::Initial; | ||||||
| use self::proximity::Proximity; | use self::proximity::Proximity; | ||||||
|  | use self::r#final::Final; | ||||||
| use self::typo::Typo; | use self::typo::Typo; | ||||||
| use self::words::Words; | use self::words::Words; | ||||||
|  | use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; | ||||||
|  | use crate::search::{word_derivations, WordDerivationsCache}; | ||||||
|  | use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; | ||||||
|  |  | ||||||
| mod asc_desc; | mod asc_desc; | ||||||
| mod attribute; | mod attribute; | ||||||
| mod exactness; | mod exactness; | ||||||
|  | pub mod r#final; | ||||||
| mod initial; | mod initial; | ||||||
| mod proximity; | mod proximity; | ||||||
| mod typo; | mod typo; | ||||||
| mod words; | mod words; | ||||||
| pub mod r#final; |  | ||||||
|  |  | ||||||
| pub trait Criterion { | pub trait Criterion { | ||||||
|     fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>; |     fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>; | ||||||
| @@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> { | |||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| enum Candidates { | enum Candidates { | ||||||
|     Allowed(RoaringBitmap), |     Allowed(RoaringBitmap), | ||||||
|     Forbidden(RoaringBitmap) |     Forbidden(RoaringBitmap), | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Default for Candidates { | impl Default for Candidates { | ||||||
| @@ -68,17 +67,55 @@ pub trait Context<'c> { | |||||||
|     fn documents_ids(&self) -> heed::Result<RoaringBitmap>; |     fn documents_ids(&self) -> heed::Result<RoaringBitmap>; | ||||||
|     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; |     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; |     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|     fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; |     fn word_pair_proximity_docids( | ||||||
|     fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; |         &self, | ||||||
|  |         left: &str, | ||||||
|  |         right: &str, | ||||||
|  |         proximity: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|  |     fn word_prefix_pair_proximity_docids( | ||||||
|  |         &self, | ||||||
|  |         left: &str, | ||||||
|  |         right: &str, | ||||||
|  |         proximity: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; |     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; | ||||||
|     fn in_prefix_cache(&self, word: &str) -> bool; |     fn in_prefix_cache(&self, word: &str) -> bool; | ||||||
|     fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>; |     fn docid_words_positions( | ||||||
|     fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>; |         &self, | ||||||
|     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>; |         docid: DocumentId, | ||||||
|  |     ) -> heed::Result<HashMap<String, RoaringBitmap>>; | ||||||
|  |     fn word_position_iterator( | ||||||
|  |         &self, | ||||||
|  |         word: &str, | ||||||
|  |         level: TreeLevel, | ||||||
|  |         in_prefix_cache: bool, | ||||||
|  |         left: Option<u32>, | ||||||
|  |         right: Option<u32>, | ||||||
|  |     ) -> heed::Result< | ||||||
|  |         Box< | ||||||
|  |             dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c, | ||||||
|  |         >, | ||||||
|  |     >; | ||||||
|  |     fn word_position_last_level( | ||||||
|  |         &self, | ||||||
|  |         word: &str, | ||||||
|  |         in_prefix_cache: bool, | ||||||
|  |     ) -> heed::Result<Option<TreeLevel>>; | ||||||
|     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; |     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; | ||||||
|     fn searchable_fields_ids(&self) ->  Result<Vec<FieldId>>; |     fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>; | ||||||
|     fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>; |     fn field_id_word_count_docids( | ||||||
|     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>>; |         &self, | ||||||
|  |         field_id: FieldId, | ||||||
|  |         word_count: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|  |     fn word_level_position_docids( | ||||||
|  |         &self, | ||||||
|  |         word: &str, | ||||||
|  |         level: TreeLevel, | ||||||
|  |         left: u32, | ||||||
|  |         right: u32, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>>; | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct CriteriaBuilder<'t> { | pub struct CriteriaBuilder<'t> { | ||||||
| @@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         self.index.word_prefix_docids.get(self.rtxn, &word) |         self.index.word_prefix_docids.get(self.rtxn, &word) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { |     fn word_pair_proximity_docids( | ||||||
|  |         &self, | ||||||
|  |         left: &str, | ||||||
|  |         right: &str, | ||||||
|  |         proximity: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         let key = (left, right, proximity); |         let key = (left, right, proximity); | ||||||
|         self.index.word_pair_proximity_docids.get(self.rtxn, &key) |         self.index.word_pair_proximity_docids.get(self.rtxn, &key) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { |     fn word_prefix_pair_proximity_docids( | ||||||
|  |         &self, | ||||||
|  |         left: &str, | ||||||
|  |         right: &str, | ||||||
|  |         proximity: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         let key = (left, right, proximity); |         let key = (left, right, proximity); | ||||||
|         self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) |         self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) | ||||||
|     } |     } | ||||||
| @@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         self.words_prefixes_fst.contains(word) |         self.words_prefixes_fst.contains(word) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { |     fn docid_words_positions( | ||||||
|  |         &self, | ||||||
|  |         docid: DocumentId, | ||||||
|  |     ) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||||
|         let mut words_positions = HashMap::new(); |         let mut words_positions = HashMap::new(); | ||||||
|         for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { |         for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { | ||||||
|             let ((_, word), positions) = result?; |             let ((_, word), positions) = result?; | ||||||
| @@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         level: TreeLevel, |         level: TreeLevel, | ||||||
|         in_prefix_cache: bool, |         in_prefix_cache: bool, | ||||||
|         left: Option<u32>, |         left: Option<u32>, | ||||||
|         right: Option<u32> |         right: Option<u32>, | ||||||
|     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> |     ) -> heed::Result< | ||||||
|     { |         Box< | ||||||
|  |             dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c, | ||||||
|  |         >, | ||||||
|  |     > { | ||||||
|         let range = { |         let range = { | ||||||
|             let left = left.unwrap_or(u32::min_value()); |             let left = left.unwrap_or(u32::min_value()); | ||||||
|             let right = right.unwrap_or(u32::max_value()); |             let right = right.unwrap_or(u32::max_value()); | ||||||
| @@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         Ok(Box::new(db.range(self.rtxn, &range)?)) |         Ok(Box::new(db.range(self.rtxn, &range)?)) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { |     fn word_position_last_level( | ||||||
|  |         &self, | ||||||
|  |         word: &str, | ||||||
|  |         in_prefix_cache: bool, | ||||||
|  |     ) -> heed::Result<Option<TreeLevel>> { | ||||||
|         let range = { |         let range = { | ||||||
|             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); |             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); | ||||||
|             let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); |             let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); | ||||||
| @@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         }; |         }; | ||||||
|         let last_level = db |         let last_level = db | ||||||
|             .remap_data_type::<heed::types::DecodeIgnore>() |             .remap_data_type::<heed::types::DecodeIgnore>() | ||||||
|             .range(self.rtxn, &range)?.last().transpose()? |             .range(self.rtxn, &range)? | ||||||
|  |             .last() | ||||||
|  |             .transpose()? | ||||||
|             .map(|((_, level, _, _), _)| level); |             .map(|((_, level, _, _), _)| level); | ||||||
|  |  | ||||||
|         Ok(last_level) |         Ok(last_level) | ||||||
| @@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> { |     fn field_id_word_count_docids( | ||||||
|  |         &self, | ||||||
|  |         field_id: FieldId, | ||||||
|  |         word_count: u8, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         let key = (field_id, word_count); |         let key = (field_id, word_count); | ||||||
|         self.index.field_id_word_count_docids.get(self.rtxn, &key) |         self.index.field_id_word_count_docids.get(self.rtxn, &key) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>> { |     fn word_level_position_docids( | ||||||
|  |         &self, | ||||||
|  |         word: &str, | ||||||
|  |         level: TreeLevel, | ||||||
|  |         left: u32, | ||||||
|  |         right: u32, | ||||||
|  |     ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         let key = (word, level, left, right); |         let key = (word, level, left, right); | ||||||
|         self.index.word_level_position_docids.get(self.rtxn, &key) |         self.index.word_level_position_docids.get(self.rtxn, &key) | ||||||
|     } |     } | ||||||
| @@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> { | |||||||
|         query_tree: Option<Operation>, |         query_tree: Option<Operation>, | ||||||
|         primitive_query: Option<Vec<PrimitiveQueryPart>>, |         primitive_query: Option<Vec<PrimitiveQueryPart>>, | ||||||
|         filtered_candidates: Option<RoaringBitmap>, |         filtered_candidates: Option<RoaringBitmap>, | ||||||
|     ) -> Result<Final<'t>> |     ) -> Result<Final<'t>> { | ||||||
|     { |  | ||||||
|         use crate::criterion::Criterion as Name; |         use crate::criterion::Criterion as Name; | ||||||
|  |  | ||||||
|         let primitive_query = primitive_query.unwrap_or_default(); |         let primitive_query = primitive_query.unwrap_or_default(); | ||||||
|  |  | ||||||
|         let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>; |         let mut criterion = | ||||||
|  |             Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>; | ||||||
|         for name in self.index.criteria(&self.rtxn)? { |         for name in self.index.criteria(&self.rtxn)? { | ||||||
|             criterion = match name { |             criterion = match name { | ||||||
|                 Name::Typo => Box::new(Typo::new(self, criterion)), |                 Name::Typo => Box::new(Typo::new(self, criterion)), | ||||||
| @@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> { | |||||||
|                 Name::Proximity => Box::new(Proximity::new(self, criterion)), |                 Name::Proximity => Box::new(Proximity::new(self, criterion)), | ||||||
|                 Name::Attribute => Box::new(Attribute::new(self, criterion)), |                 Name::Attribute => Box::new(Attribute::new(self, criterion)), | ||||||
|                 Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), |                 Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), | ||||||
|                 Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), |                 Name::Asc(field) => { | ||||||
|                 Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), |                     Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) | ||||||
|  |                 } | ||||||
|  |                 Name::Desc(field) => { | ||||||
|  |                     Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) | ||||||
|  |                 } | ||||||
|             }; |             }; | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>( | |||||||
|     ctx: &'t dyn Context, |     ctx: &'t dyn Context, | ||||||
|     query_tree: &Operation, |     query_tree: &Operation, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     fn resolve_operation<'t>( |     fn resolve_operation<'t>( | ||||||
|         ctx: &'t dyn Context, |         ctx: &'t dyn Context, | ||||||
|         query_tree: &Operation, |         query_tree: &Operation, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<RoaringBitmap> |     ) -> Result<RoaringBitmap> { | ||||||
|     { |         use Operation::{And, Or, Phrase, Query}; | ||||||
|         use Operation::{And, Phrase, Or, Query}; |  | ||||||
|  |  | ||||||
|         match query_tree { |         match query_tree { | ||||||
|             And(ops) => { |             And(ops) => { | ||||||
|                 let mut ops = ops.iter().map(|op| { |                 let mut ops = ops | ||||||
|                     resolve_operation(ctx, op, wdcache) |                     .iter() | ||||||
|                 }).collect::<Result<Vec<_>>>()?; |                     .map(|op| resolve_operation(ctx, op, wdcache)) | ||||||
|  |                     .collect::<Result<Vec<_>>>()?; | ||||||
|  |  | ||||||
|                 ops.sort_unstable_by_key(|cds| cds.len()); |                 ops.sort_unstable_by_key(|cds| cds.len()); | ||||||
|  |  | ||||||
| @@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>( | |||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             Phrase(words) => { |             Phrase(words) => { | ||||||
|                 let mut candidates = RoaringBitmap::new(); |                 let mut candidates = RoaringBitmap::new(); | ||||||
|                 let mut first_loop = true; |                 let mut first_loop = true; | ||||||
| @@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>( | |||||||
|                             } else { |                             } else { | ||||||
|                                 candidates &= pair_docids; |                                 candidates &= pair_docids; | ||||||
|                             } |                             } | ||||||
|                         }, |                         } | ||||||
|                         None => return Ok(RoaringBitmap::new()) |                         None => return Ok(RoaringBitmap::new()), | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             Or(_, ops) => { |             Or(_, ops) => { | ||||||
|                 let mut candidates = RoaringBitmap::new(); |                 let mut candidates = RoaringBitmap::new(); | ||||||
|                 for op in ops { |                 for op in ops { | ||||||
| @@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>( | |||||||
|                     candidates.union_with(&docids); |                     candidates.union_with(&docids); | ||||||
|                 } |                 } | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             Query(q) => Ok(query_docids(ctx, q, wdcache)?), |             Query(q) => Ok(query_docids(ctx, q, wdcache)?), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>( | |||||||
|     resolve_operation(ctx, query_tree, wdcache) |     resolve_operation(ctx, query_tree, wdcache) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( | fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( | ||||||
|     ctx: &dyn Context, |     ctx: &dyn Context, | ||||||
|     left_words: &[(T, u8)], |     left_words: &[(T, u8)], | ||||||
|     right_words: &[(U, u8)], |     right_words: &[(U, u8)], | ||||||
|     proximity: u8 |     proximity: u8, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     let mut docids = RoaringBitmap::new(); |     let mut docids = RoaringBitmap::new(); | ||||||
|     for (left, _l_typo) in left_words { |     for (left, _l_typo) in left_words { | ||||||
|         for (right, _r_typo) in right_words { |         for (right, _r_typo) in right_words { | ||||||
|             let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); |             let current_docids = ctx | ||||||
|  |                 .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? | ||||||
|  |                 .unwrap_or_default(); | ||||||
|             docids.union_with(&current_docids); |             docids.union_with(&current_docids); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -319,8 +391,7 @@ fn query_docids( | |||||||
|     ctx: &dyn Context, |     ctx: &dyn Context, | ||||||
|     query: &Query, |     query: &Query, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     match &query.kind { |     match &query.kind { | ||||||
|         QueryKind::Exact { word, .. } => { |         QueryKind::Exact { word, .. } => { | ||||||
|             if query.prefix && ctx.in_prefix_cache(&word) { |             if query.prefix && ctx.in_prefix_cache(&word) { | ||||||
| @@ -336,7 +407,7 @@ fn query_docids( | |||||||
|             } else { |             } else { | ||||||
|                 Ok(ctx.word_docids(&word)?.unwrap_or_default()) |                 Ok(ctx.word_docids(&word)?.unwrap_or_default()) | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         QueryKind::Tolerant { typo, word } => { |         QueryKind::Tolerant { typo, word } => { | ||||||
|             let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; |             let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; | ||||||
|             let mut docids = RoaringBitmap::new(); |             let mut docids = RoaringBitmap::new(); | ||||||
| @@ -345,7 +416,7 @@ fn query_docids( | |||||||
|                 docids.union_with(&current_docids); |                 docids.union_with(&current_docids); | ||||||
|             } |             } | ||||||
|             Ok(docids) |             Ok(docids) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -355,8 +426,7 @@ fn query_pair_proximity_docids( | |||||||
|     right: &Query, |     right: &Query, | ||||||
|     proximity: u8, |     proximity: u8, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     if proximity >= 8 { |     if proximity >= 8 { | ||||||
|         let mut candidates = query_docids(ctx, left, wdcache)?; |         let mut candidates = query_docids(ctx, left, wdcache)?; | ||||||
|         let right_candidates = query_docids(ctx, right, wdcache)?; |         let right_candidates = query_docids(ctx, right, wdcache)?; | ||||||
| @@ -368,20 +438,31 @@ fn query_pair_proximity_docids( | |||||||
|     match (&left.kind, &right.kind) { |     match (&left.kind, &right.kind) { | ||||||
|         (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { |         (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { | ||||||
|             if prefix && ctx.in_prefix_cache(&right) { |             if prefix && ctx.in_prefix_cache(&right) { | ||||||
|                 Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) |                 Ok(ctx | ||||||
|  |                     .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? | ||||||
|  |                     .unwrap_or_default()) | ||||||
|             } else if prefix { |             } else if prefix { | ||||||
|                 let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; |                 let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; | ||||||
|                 all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) |                 all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) | ||||||
|             } else { |             } else { | ||||||
|                 Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) |                 Ok(ctx | ||||||
|  |                     .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? | ||||||
|  |                     .unwrap_or_default()) | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { |         (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { | ||||||
|             let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); |             let l_words = | ||||||
|  |                 word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); | ||||||
|             if prefix && ctx.in_prefix_cache(&right) { |             if prefix && ctx.in_prefix_cache(&right) { | ||||||
|                 let mut docids = RoaringBitmap::new(); |                 let mut docids = RoaringBitmap::new(); | ||||||
|                 for (left, _) in l_words { |                 for (left, _) in l_words { | ||||||
|                     let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); |                     let current_docids = ctx | ||||||
|  |                         .word_prefix_pair_proximity_docids( | ||||||
|  |                             left.as_ref(), | ||||||
|  |                             right.as_ref(), | ||||||
|  |                             proximity, | ||||||
|  |                         )? | ||||||
|  |                         .unwrap_or_default(); | ||||||
|                     docids.union_with(&current_docids); |                     docids.union_with(&current_docids); | ||||||
|                 } |                 } | ||||||
|                 Ok(docids) |                 Ok(docids) | ||||||
| @@ -391,28 +472,36 @@ fn query_pair_proximity_docids( | |||||||
|             } else { |             } else { | ||||||
|                 all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) |                 all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { |         (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { | ||||||
|             let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; |             let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; | ||||||
|             all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) |             all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) | ||||||
|         }, |         } | ||||||
|         (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { |         ( | ||||||
|             let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); |             QueryKind::Tolerant { typo: l_typo, word: left }, | ||||||
|  |             QueryKind::Tolerant { typo: r_typo, word: right }, | ||||||
|  |         ) => { | ||||||
|  |             let l_words = | ||||||
|  |                 word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); | ||||||
|             let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; |             let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; | ||||||
|             all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) |             all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| pub mod test { | pub mod test { | ||||||
|     use maplit::hashmap; |  | ||||||
|     use rand::{Rng, SeedableRng, rngs::StdRng}; |  | ||||||
|  |  | ||||||
|     use super::*; |  | ||||||
|     use std::collections::HashMap; |     use std::collections::HashMap; | ||||||
|  |  | ||||||
|     fn s(s: &str) -> String { s.to_string() } |     use maplit::hashmap; | ||||||
|  |     use rand::rngs::StdRng; | ||||||
|  |     use rand::{Rng, SeedableRng}; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|  |     fn s(s: &str) -> String { | ||||||
|  |         s.to_string() | ||||||
|  |     } | ||||||
|     pub struct TestContext<'t> { |     pub struct TestContext<'t> { | ||||||
|         words_fst: fst::Set<Cow<'t, [u8]>>, |         words_fst: fst::Set<Cow<'t, [u8]>>, | ||||||
|         word_docids: HashMap<String, RoaringBitmap>, |         word_docids: HashMap<String, RoaringBitmap>, | ||||||
| @@ -435,12 +524,22 @@ pub mod test { | |||||||
|             Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) |             Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { |         fn word_pair_proximity_docids( | ||||||
|  |             &self, | ||||||
|  |             left: &str, | ||||||
|  |             right: &str, | ||||||
|  |             proximity: u8, | ||||||
|  |         ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|             let key = (left.to_string(), right.to_string(), proximity.into()); |             let key = (left.to_string(), right.to_string(), proximity.into()); | ||||||
|             Ok(self.word_pair_proximity_docids.get(&key).cloned()) |             Ok(self.word_pair_proximity_docids.get(&key).cloned()) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { |         fn word_prefix_pair_proximity_docids( | ||||||
|  |             &self, | ||||||
|  |             left: &str, | ||||||
|  |             right: &str, | ||||||
|  |             proximity: u8, | ||||||
|  |         ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|             let key = (left.to_string(), right.to_string(), proximity.into()); |             let key = (left.to_string(), right.to_string(), proximity.into()); | ||||||
|             Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) |             Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) | ||||||
|         } |         } | ||||||
| @@ -453,24 +552,44 @@ pub mod test { | |||||||
|             self.word_prefix_docids.contains_key(&word.to_string()) |             self.word_prefix_docids.contains_key(&word.to_string()) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { |         fn docid_words_positions( | ||||||
|  |             &self, | ||||||
|  |             docid: DocumentId, | ||||||
|  |         ) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||||
|             if let Some(docid_words) = self.docid_words.get(&docid) { |             if let Some(docid_words) = self.docid_words.get(&docid) { | ||||||
|                 Ok(docid_words |                 Ok(docid_words | ||||||
|                     .iter() |                     .iter() | ||||||
|                     .enumerate() |                     .enumerate() | ||||||
|                     .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) |                     .map(|(i, w)| { | ||||||
|                     .collect() |                         (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))) | ||||||
|                 ) |                     }) | ||||||
|  |                     .collect()) | ||||||
|             } else { |             } else { | ||||||
|                 Ok(HashMap::new()) |                 Ok(HashMap::new()) | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> { |         fn word_position_iterator( | ||||||
|  |             &self, | ||||||
|  |             _word: &str, | ||||||
|  |             _level: TreeLevel, | ||||||
|  |             _in_prefix_cache: bool, | ||||||
|  |             _left: Option<u32>, | ||||||
|  |             _right: Option<u32>, | ||||||
|  |         ) -> heed::Result< | ||||||
|  |             Box< | ||||||
|  |                 dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> | ||||||
|  |                     + 'c, | ||||||
|  |             >, | ||||||
|  |         > { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { |         fn word_position_last_level( | ||||||
|  |             &self, | ||||||
|  |             _word: &str, | ||||||
|  |             _in_prefix_cache: bool, | ||||||
|  |         ) -> heed::Result<Option<TreeLevel>> { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -478,15 +597,25 @@ pub mod test { | |||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn searchable_fields_ids(&self) ->  Result<Vec<FieldId>> { |         fn searchable_fields_ids(&self) -> Result<Vec<FieldId>> { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result<Option<RoaringBitmap>> { |         fn word_level_position_docids( | ||||||
|  |             &self, | ||||||
|  |             _word: &str, | ||||||
|  |             _level: TreeLevel, | ||||||
|  |             _left: u32, | ||||||
|  |             _right: u32, | ||||||
|  |         ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> { |         fn field_id_word_count_docids( | ||||||
|  |             &self, | ||||||
|  |             _field_id: FieldId, | ||||||
|  |             _word_count: u8, | ||||||
|  |         ) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -506,7 +635,7 @@ pub mod test { | |||||||
|                 RoaringBitmap::from_sorted_iter(values.into_iter()) |                 RoaringBitmap::from_sorted_iter(values.into_iter()) | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let word_docids = hashmap!{ |             let word_docids = hashmap! { | ||||||
|                 s("hello")      => random_postings(rng,   1500), |                 s("hello")      => random_postings(rng,   1500), | ||||||
|                 s("hi")         => random_postings(rng,   4000), |                 s("hi")         => random_postings(rng,   4000), | ||||||
|                 s("word")       => random_postings(rng,   2500), |                 s("word")       => random_postings(rng,   2500), | ||||||
| @@ -530,7 +659,7 @@ pub mod test { | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let word_prefix_docids = hashmap!{ |             let word_prefix_docids = hashmap! { | ||||||
|                 s("h")   => &word_docids[&s("hello")] | &word_docids[&s("hi")], |                 s("h")   => &word_docids[&s("hello")] | &word_docids[&s("hi")], | ||||||
|                 s("wor") => &word_docids[&s("word")]  | &word_docids[&s("world")], |                 s("wor") => &word_docids[&s("word")]  | &word_docids[&s("world")], | ||||||
|                 s("20")  => &word_docids[&s("2020")]  | &word_docids[&s("2021")], |                 s("20")  => &word_docids[&s("2020")]  | &word_docids[&s("2021")], | ||||||
| @@ -540,7 +669,9 @@ pub mod test { | |||||||
|             let mut word_prefix_pair_proximity_docids = HashMap::new(); |             let mut word_prefix_pair_proximity_docids = HashMap::new(); | ||||||
|             for (lword, lcandidates) in &word_docids { |             for (lword, lcandidates) in &word_docids { | ||||||
|                 for (rword, rcandidates) in &word_docids { |                 for (rword, rcandidates) in &word_docids { | ||||||
|                     if lword == rword { continue } |                     if lword == rword { | ||||||
|  |                         continue; | ||||||
|  |                     } | ||||||
|                     let candidates = lcandidates & rcandidates; |                     let candidates = lcandidates & rcandidates; | ||||||
|                     for candidate in candidates { |                     for candidate in candidates { | ||||||
|                         if let Some(docid_words) = docid_words.get(&candidate) { |                         if let Some(docid_words) = docid_words.get(&candidate) { | ||||||
| @@ -551,24 +682,31 @@ pub mod test { | |||||||
|                             } else { |                             } else { | ||||||
|                                 (s(lword), s(rword), (lposition - rposition + 1) as i32) |                                 (s(lword), s(rword), (lposition - rposition + 1) as i32) | ||||||
|                             }; |                             }; | ||||||
|                             let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); |                             let docids = word_pair_proximity_docids | ||||||
|  |                                 .entry(key) | ||||||
|  |                                 .or_insert(RoaringBitmap::new()); | ||||||
|                             docids.push(candidate); |                             docids.push(candidate); | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 for (pword, pcandidates) in &word_prefix_docids { |                 for (pword, pcandidates) in &word_prefix_docids { | ||||||
|                     if lword.starts_with(pword) { continue } |                     if lword.starts_with(pword) { | ||||||
|  |                         continue; | ||||||
|  |                     } | ||||||
|                     let candidates = lcandidates & pcandidates; |                     let candidates = lcandidates & pcandidates; | ||||||
|                     for candidate in candidates { |                     for candidate in candidates { | ||||||
|                         if let Some(docid_words) = docid_words.get(&candidate) { |                         if let Some(docid_words) = docid_words.get(&candidate) { | ||||||
|                             let lposition = docid_words.iter().position(|w| w == lword).unwrap(); |                             let lposition = docid_words.iter().position(|w| w == lword).unwrap(); | ||||||
|                             let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); |                             let rposition = | ||||||
|  |                                 docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); | ||||||
|                             let key = if lposition < rposition { |                             let key = if lposition < rposition { | ||||||
|                                 (s(lword), s(pword), (rposition - lposition) as i32) |                                 (s(lword), s(pword), (rposition - lposition) as i32) | ||||||
|                             } else { |                             } else { | ||||||
|                                 (s(lword), s(pword), (lposition - rposition + 1) as i32) |                                 (s(lword), s(pword), (lposition - rposition + 1) as i32) | ||||||
|                             }; |                             }; | ||||||
|                             let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); |                             let docids = word_prefix_pair_proximity_docids | ||||||
|  |                                 .entry(key) | ||||||
|  |                                 .or_insert(RoaringBitmap::new()); | ||||||
|                             docids.push(candidate); |                             docids.push(candidate); | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|   | |||||||
| @@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap}; | |||||||
| use std::collections::hash_map::HashMap; | use std::collections::hash_map::HashMap; | ||||||
| use std::mem::take; | use std::mem::take; | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; |  | ||||||
| use log::debug; | use log::debug; | ||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::search::query_tree::{maximum_proximity, Operation, Query}; |  | ||||||
| use crate::search::{build_dfa, WordDerivationsCache}; |  | ||||||
| use crate::search::{query_tree::QueryKind}; |  | ||||||
| use crate::{DocumentId, Position, Result}; |  | ||||||
| use super::{ | use super::{ | ||||||
|     Context, |     query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, | ||||||
|     Criterion, |     CriterionParameters, CriterionResult, | ||||||
|     CriterionParameters, |  | ||||||
|     CriterionResult, |  | ||||||
|     query_docids, |  | ||||||
|     query_pair_proximity_docids, |  | ||||||
|     resolve_query_tree, |  | ||||||
| }; | }; | ||||||
|  | use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; | ||||||
|  | use crate::search::{build_dfa, WordDerivationsCache}; | ||||||
|  | use crate::{DocumentId, Position, Result}; | ||||||
|  |  | ||||||
| type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; | type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; | ||||||
|  |  | ||||||
| @@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         loop { |         loop { | ||||||
|             debug!("Proximity at iteration {} (max prox {:?}) ({:?})", |             debug!( | ||||||
|  |                 "Proximity at iteration {} (max prox {:?}) ({:?})", | ||||||
|                 self.proximity, |                 self.proximity, | ||||||
|                 self.state.as_ref().map(|(mp, _, _)| mp), |                 self.state.as_ref().map(|(mp, _, _)| mp), | ||||||
|                 self.state.as_ref().map(|(_, _, cd)| cd), |                 self.state.as_ref().map(|(_, _, cd)| cd), | ||||||
|             ); |             ); | ||||||
|  |  | ||||||
|             match &mut self.state { |             match &mut self.state { | ||||||
|                 Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { |                 Some((max_prox, _, allowed_candidates)) | ||||||
|  |                     if allowed_candidates.is_empty() || self.proximity > *max_prox => | ||||||
|  |                 { | ||||||
|                     self.state = None; // reset state |                     self.state = None; // reset state | ||||||
|                 }, |                 } | ||||||
|                 Some((_, query_tree, allowed_candidates)) => { |                 Some((_, query_tree, allowed_candidates)) => { | ||||||
|                     let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { |                     let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD | ||||||
|  |                         && self.proximity > PROXIMITY_THRESHOLD | ||||||
|  |                     { | ||||||
|                         if let Some(cache) = self.plane_sweep_cache.as_mut() { |                         if let Some(cache) = self.plane_sweep_cache.as_mut() { | ||||||
|                             match cache.next() { |                             match cache.next() { | ||||||
|                                 Some((p, candidates)) => { |                                 Some((p, candidates)) => { | ||||||
|                                     self.proximity = p; |                                     self.proximity = p; | ||||||
|                                     candidates |                                     candidates | ||||||
|                                 }, |                                 } | ||||||
|                                 None => { |                                 None => { | ||||||
|                                     self.state = None; // reset state |                                     self.state = None; // reset state | ||||||
|                                     continue |                                     continue; | ||||||
|                                 }, |                                 } | ||||||
|                             } |                             } | ||||||
|                         } else { |                         } else { | ||||||
|                             let cache = resolve_plane_sweep_candidates( |                             let cache = resolve_plane_sweep_candidates( | ||||||
| @@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> { | |||||||
|                             )?; |                             )?; | ||||||
|                             self.plane_sweep_cache = Some(cache.into_iter()); |                             self.plane_sweep_cache = Some(cache.into_iter()); | ||||||
|  |  | ||||||
|                             continue |                             continue; | ||||||
|                         } |                         } | ||||||
|                     } else { // use set theory based algorithm |                     } else { | ||||||
|  |                         // use set theory based algorithm | ||||||
|                         resolve_candidates( |                         resolve_candidates( | ||||||
|                             self.ctx, |                             self.ctx, | ||||||
|                             &query_tree, |                             &query_tree, | ||||||
| @@ -117,39 +117,50 @@ impl<'t> Criterion for Proximity<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(take(&mut self.bucket_candidates)), |                         bucket_candidates: Some(take(&mut self.bucket_candidates)), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree: Some(query_tree), | ||||||
|                             let mut candidates = match candidates { |                         candidates, | ||||||
|                                 Some(candidates) => candidates, |                         filtered_candidates, | ||||||
|                                 None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, |                         bucket_candidates, | ||||||
|                             }; |                     }) => { | ||||||
|  |                         let mut candidates = match candidates { | ||||||
|                             if let Some(filtered_candidates) = filtered_candidates { |                             Some(candidates) => candidates, | ||||||
|                                 candidates &= filtered_candidates; |                             None => { | ||||||
|  |                                 resolve_query_tree(self.ctx, &query_tree, params.wdcache)? | ||||||
|  |                                     - params.excluded_candidates | ||||||
|                             } |                             } | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                             match bucket_candidates { |                         if let Some(filtered_candidates) = filtered_candidates { | ||||||
|                                 Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, |                             candidates &= filtered_candidates; | ||||||
|                                 None => self.bucket_candidates |= &candidates, |                         } | ||||||
|                             } |  | ||||||
|  |  | ||||||
|                             let maximum_proximity = maximum_proximity(&query_tree); |                         match bucket_candidates { | ||||||
|                             self.state = Some((maximum_proximity as u8, query_tree, candidates)); |                             Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, | ||||||
|                             self.proximity = 0; |                             None => self.bucket_candidates |= &candidates, | ||||||
|                             self.plane_sweep_cache = None; |                         } | ||||||
|                         }, |  | ||||||
|                         Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { |                         let maximum_proximity = maximum_proximity(&query_tree); | ||||||
|                             return Ok(Some(CriterionResult { |                         self.state = Some((maximum_proximity as u8, query_tree, candidates)); | ||||||
|                                 query_tree: None, |                         self.proximity = 0; | ||||||
|                                 candidates, |                         self.plane_sweep_cache = None; | ||||||
|                                 filtered_candidates, |  | ||||||
|                                 bucket_candidates, |  | ||||||
|                             })); |  | ||||||
|                         }, |  | ||||||
|                         None => return Ok(None), |  | ||||||
|                     } |                     } | ||||||
|  |                     Some(CriterionResult { | ||||||
|  |                         query_tree: None, | ||||||
|  |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         return Ok(Some(CriterionResult { | ||||||
|  |                             query_tree: None, | ||||||
|  |                             candidates, | ||||||
|  |                             filtered_candidates, | ||||||
|  |                             bucket_candidates, | ||||||
|  |                         })); | ||||||
|  |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -162,46 +173,48 @@ fn resolve_candidates<'t>( | |||||||
|     proximity: u8, |     proximity: u8, | ||||||
|     cache: &mut Cache, |     cache: &mut Cache, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     fn resolve_operation<'t>( |     fn resolve_operation<'t>( | ||||||
|         ctx: &'t dyn Context, |         ctx: &'t dyn Context, | ||||||
|         query_tree: &Operation, |         query_tree: &Operation, | ||||||
|         proximity: u8, |         proximity: u8, | ||||||
|         cache: &mut Cache, |         cache: &mut Cache, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> |     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> { | ||||||
|     { |         use Operation::{And, Or, Phrase}; | ||||||
|         use Operation::{And, Phrase, Or}; |  | ||||||
|  |  | ||||||
|         let result = match query_tree { |         let result = match query_tree { | ||||||
|             And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, |             And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, | ||||||
|             Phrase(words) => if proximity == 0 { |             Phrase(words) => { | ||||||
|                 let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); |                 if proximity == 0 { | ||||||
|                 let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); |                     let most_left = words | ||||||
|                 let mut candidates = None; |                         .first() | ||||||
|                 for slice in words.windows(2) { |                         .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); | ||||||
|                     let (left, right) = (&slice[0], &slice[1]); |                     let most_right = words | ||||||
|                     match ctx.word_pair_proximity_docids(left, right, 1)? { |                         .last() | ||||||
|                         Some(pair_docids) => { |                         .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); | ||||||
|                             match candidates.as_mut() { |                     let mut candidates = None; | ||||||
|  |                     for slice in words.windows(2) { | ||||||
|  |                         let (left, right) = (&slice[0], &slice[1]); | ||||||
|  |                         match ctx.word_pair_proximity_docids(left, right, 1)? { | ||||||
|  |                             Some(pair_docids) => match candidates.as_mut() { | ||||||
|                                 Some(candidates) => *candidates &= pair_docids, |                                 Some(candidates) => *candidates &= pair_docids, | ||||||
|                                 None => candidates = Some(pair_docids), |                                 None => candidates = Some(pair_docids), | ||||||
|  |                             }, | ||||||
|  |                             None => { | ||||||
|  |                                 candidates = None; | ||||||
|  |                                 break; | ||||||
|                             } |                             } | ||||||
|                         }, |  | ||||||
|                         None => { |  | ||||||
|                             candidates = None; |  | ||||||
|                             break; |  | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|  |                     match (most_left, most_right, candidates) { | ||||||
|  |                         (Some(l), Some(r), Some(c)) => vec![(l, r, c)], | ||||||
|  |                         _otherwise => Default::default(), | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     Default::default() | ||||||
|                 } |                 } | ||||||
|                 match (most_left, most_right, candidates) { |             } | ||||||
|                     (Some(l), Some(r), Some(c)) => vec![(l, r, c)], |  | ||||||
|                     _otherwise => Default::default(), |  | ||||||
|                 } |  | ||||||
|             } else { |  | ||||||
|                 Default::default() |  | ||||||
|             }, |  | ||||||
|             Or(_, ops) => { |             Or(_, ops) => { | ||||||
|                 let mut output = Vec::new(); |                 let mut output = Vec::new(); | ||||||
|                 for op in ops { |                 for op in ops { | ||||||
| @@ -209,13 +222,15 @@ fn resolve_candidates<'t>( | |||||||
|                     output.extend(result); |                     output.extend(result); | ||||||
|                 } |                 } | ||||||
|                 output |                 output | ||||||
|             }, |             } | ||||||
|             Operation::Query(q) => if proximity == 0 { |             Operation::Query(q) => { | ||||||
|                 let candidates = query_docids(ctx, q, wdcache)?; |                 if proximity == 0 { | ||||||
|                 vec![(q.clone(), q.clone(), candidates)] |                     let candidates = query_docids(ctx, q, wdcache)?; | ||||||
|             } else { |                     vec![(q.clone(), q.clone(), candidates)] | ||||||
|                 Default::default() |                 } else { | ||||||
|             }, |                     Default::default() | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         Ok(result) |         Ok(result) | ||||||
| @@ -228,8 +243,7 @@ fn resolve_candidates<'t>( | |||||||
|         proximity: u8, |         proximity: u8, | ||||||
|         cache: &mut Cache, |         cache: &mut Cache, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> |     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> { | ||||||
|     { |  | ||||||
|         fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> { |         fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> { | ||||||
|             (0..=mana.min(left_max)).map(move |m| (m, mana - m)) |             (0..=mana.min(left_max)).map(move |m| (m, mana - m)) | ||||||
|         } |         } | ||||||
| @@ -257,7 +271,8 @@ fn resolve_candidates<'t>( | |||||||
|  |  | ||||||
|                 for (ll, lr, lcandidates) in lefts { |                 for (ll, lr, lcandidates) in lefts { | ||||||
|                     for (rl, rr, rcandidates) in rights { |                     for (rl, rr, rcandidates) in rights { | ||||||
|                         let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; |                         let mut candidates = | ||||||
|  |                             query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; | ||||||
|                         if lcandidates.len() < rcandidates.len() { |                         if lcandidates.len() < rcandidates.len() { | ||||||
|                             candidates.intersect_with(lcandidates); |                             candidates.intersect_with(lcandidates); | ||||||
|                             candidates.intersect_with(rcandidates); |                             candidates.intersect_with(rcandidates); | ||||||
| @@ -282,22 +297,26 @@ fn resolve_candidates<'t>( | |||||||
|         proximity: u8, |         proximity: u8, | ||||||
|         cache: &mut Cache, |         cache: &mut Cache, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> |     ) -> Result<Vec<(Query, Query, RoaringBitmap)>> { | ||||||
|     { |  | ||||||
|         // Extract the first two elements but gives the tail |         // Extract the first two elements but gives the tail | ||||||
|         // that is just after the first element. |         // that is just after the first element. | ||||||
|         let next = branches.split_first().map(|(h1, t)| { |         let next = | ||||||
|             (h1, t.split_first().map(|(h2, _)| (h2, t))) |             branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t)))); | ||||||
|         }); |  | ||||||
|  |  | ||||||
|         match next { |         match next { | ||||||
|             Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache), |             Some((head1, Some((head2, [_])))) => { | ||||||
|  |                 mdfs_pair(ctx, head1, head2, proximity, cache, wdcache) | ||||||
|  |             } | ||||||
|             Some((head1, Some((head2, tail)))) => { |             Some((head1, Some((head2, tail)))) => { | ||||||
|                 let mut output = Vec::new(); |                 let mut output = Vec::new(); | ||||||
|                 for p in 0..=proximity { |                 for p in 0..=proximity { | ||||||
|                     for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? { |                     for (lhead, _, head_candidates) in | ||||||
|  |                         mdfs_pair(ctx, head1, head2, p, cache, wdcache)? | ||||||
|  |                     { | ||||||
|                         if !head_candidates.is_empty() { |                         if !head_candidates.is_empty() { | ||||||
|                             for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { |                             for (_, rtail, mut candidates) in | ||||||
|  |                                 mdfs(ctx, tail, proximity - p, cache, wdcache)? | ||||||
|  |                             { | ||||||
|                                 candidates.intersect_with(&head_candidates); |                                 candidates.intersect_with(&head_candidates); | ||||||
|                                 if !candidates.is_empty() { |                                 if !candidates.is_empty() { | ||||||
|                                     output.push((lhead.clone(), rtail, candidates)); |                                     output.push((lhead.clone(), rtail, candidates)); | ||||||
| @@ -307,7 +326,7 @@ fn resolve_candidates<'t>( | |||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(output) |                 Ok(output) | ||||||
|             }, |             } | ||||||
|             Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), |             Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), | ||||||
|             None => Ok(Default::default()), |             None => Ok(Default::default()), | ||||||
|         } |         } | ||||||
| @@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates( | |||||||
|     query_tree: &Operation, |     query_tree: &Operation, | ||||||
|     allowed_candidates: &RoaringBitmap, |     allowed_candidates: &RoaringBitmap, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<BTreeMap<u8, RoaringBitmap>> | ) -> Result<BTreeMap<u8, RoaringBitmap>> { | ||||||
| { |  | ||||||
|     /// FIXME may be buggy with query like "new new york" |     /// FIXME may be buggy with query like "new new york" | ||||||
|     fn plane_sweep( |     fn plane_sweep( | ||||||
|         groups_positions: Vec<Vec<(Position, u8, Position)>>, |         groups_positions: Vec<Vec<(Position, u8, Position)>>, | ||||||
|         consecutive: bool, |         consecutive: bool, | ||||||
|     ) -> Result<Vec<(Position, u8, Position)>> |     ) -> Result<Vec<(Position, u8, Position)>> { | ||||||
|     { |  | ||||||
|         fn compute_groups_proximity( |         fn compute_groups_proximity( | ||||||
|             groups: &[(usize, (Position, u8, Position))], |             groups: &[(usize, (Position, u8, Position))], | ||||||
|             consecutive: bool, |             consecutive: bool, | ||||||
|         ) -> Option<(Position, u8, Position)> |         ) -> Option<(Position, u8, Position)> { | ||||||
|         { |  | ||||||
|             // take the inner proximity of the first group as initial |             // take the inner proximity of the first group as initial | ||||||
|             let (_, (_, mut proximity, _)) = groups.first()?; |             let (_, (_, mut proximity, _)) = groups.first()?; | ||||||
|             let (_, (left_most_pos, _, _)) = groups.first()?; |             let (_, (left_most_pos, _, _)) = groups.first()?; | ||||||
|             let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; |             let (_, (_, _, right_most_pos)) = | ||||||
|  |                 groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; | ||||||
|  |  | ||||||
|             for pair in groups.windows(2) { |             for pair in groups.windows(2) { | ||||||
|                 if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { |                 if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { | ||||||
|                     // if two positions are equal, meaning that they share at least a word, we return None |                     // if two positions are equal, meaning that they share at least a word, we return None | ||||||
|                     if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { |                     if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { | ||||||
|                         return None |                         return None; | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|                     let pair_proximity = { |                     let pair_proximity = { | ||||||
|                         // if intervals are disjoint [..].(..) |                         // if intervals are disjoint [..].(..) | ||||||
|                         if lpos2 > rpos1 { lpos2 - rpos1 } |                         if lpos2 > rpos1 { | ||||||
|  |                             lpos2 - rpos1 | ||||||
|  |                         } | ||||||
|                         // if the second interval is a subset of the first [.(..).] |                         // if the second interval is a subset of the first [.(..).] | ||||||
|                         else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) } |                         else if rpos2 < rpos1 { | ||||||
|  |                             (lpos2 - lpos1).min(rpos1 - rpos2) | ||||||
|  |                         } | ||||||
|                         // if intervals overlaps [.(..].) |                         // if intervals overlaps [.(..].) | ||||||
|                         else { (lpos2 - lpos1).min(rpos2 - rpos1) } |                         else { | ||||||
|  |                             (lpos2 - lpos1).min(rpos2 - rpos1) | ||||||
|  |                         } | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|                     // if groups are in the good order (query order) we remove 1 to the proximity |                     // if groups are in the good order (query order) we remove 1 to the proximity | ||||||
|                     // the proximity is clamped to 7 |                     // the proximity is clamped to 7 | ||||||
|                     let pair_proximity = if i1 < i2 { |                     let pair_proximity = | ||||||
|                         (pair_proximity - 1).min(7) |                         if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) }; | ||||||
|                     } else { |  | ||||||
|                         pair_proximity.min(7) |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     proximity += pair_proximity as u8 + prox2; |                     proximity += pair_proximity as u8 + prox2; | ||||||
|                 } |                 } | ||||||
| @@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates( | |||||||
|  |  | ||||||
|         let groups_len = groups_positions.len(); |         let groups_len = groups_positions.len(); | ||||||
|  |  | ||||||
|         let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); |         let mut groups_positions: Vec<_> = | ||||||
|  |             groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); | ||||||
|  |  | ||||||
|         // Pop top elements of each list. |         // Pop top elements of each list. | ||||||
|         let mut current = Vec::with_capacity(groups_len); |         let mut current = Vec::with_capacity(groups_len); | ||||||
| @@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates( | |||||||
|         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, |         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, | ||||||
|         words_positions: &HashMap<String, RoaringBitmap>, |         words_positions: &HashMap<String, RoaringBitmap>, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<Vec<(Position, u8, Position)>> |     ) -> Result<Vec<(Position, u8, Position)>> { | ||||||
|     { |         use Operation::{And, Or, Phrase}; | ||||||
|         use Operation::{And, Phrase, Or}; |  | ||||||
|  |  | ||||||
|         if let Some(result) = rocache.get(query_tree) { |         if let Some(result) = rocache.get(query_tree) { | ||||||
|             return Ok(result.clone()); |             return Ok(result.clone()); | ||||||
| @@ -462,13 +482,20 @@ fn resolve_plane_sweep_candidates( | |||||||
|  |  | ||||||
|         let result = match query_tree { |         let result = match query_tree { | ||||||
|             And(ops) => { |             And(ops) => { | ||||||
|                  let mut groups_positions = Vec::with_capacity(ops.len()); |                 let mut groups_positions = Vec::with_capacity(ops.len()); | ||||||
|                 for operation in ops { |                 for operation in ops { | ||||||
|                     let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; |                     let positions = resolve_operation( | ||||||
|  |                         ctx, | ||||||
|  |                         operation, | ||||||
|  |                         docid, | ||||||
|  |                         rocache, | ||||||
|  |                         words_positions, | ||||||
|  |                         wdcache, | ||||||
|  |                     )?; | ||||||
|                     groups_positions.push(positions); |                     groups_positions.push(positions); | ||||||
|                 } |                 } | ||||||
|                 plane_sweep(groups_positions, false)? |                 plane_sweep(groups_positions, false)? | ||||||
|             }, |             } | ||||||
|             Phrase(words) => { |             Phrase(words) => { | ||||||
|                 let mut groups_positions = Vec::with_capacity(words.len()); |                 let mut groups_positions = Vec::with_capacity(words.len()); | ||||||
|                 for word in words { |                 for word in words { | ||||||
| @@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates( | |||||||
|                     groups_positions.push(positions); |                     groups_positions.push(positions); | ||||||
|                 } |                 } | ||||||
|                 plane_sweep(groups_positions, true)? |                 plane_sweep(groups_positions, true)? | ||||||
|             }, |             } | ||||||
|             Or(_, ops) => { |             Or(_, ops) => { | ||||||
|                 let mut result = Vec::new(); |                 let mut result = Vec::new(); | ||||||
|                 for op in ops { |                 for op in ops { | ||||||
|                     result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) |                     result.extend(resolve_operation( | ||||||
|  |                         ctx, | ||||||
|  |                         op, | ||||||
|  |                         docid, | ||||||
|  |                         rocache, | ||||||
|  |                         words_positions, | ||||||
|  |                         wdcache, | ||||||
|  |                     )?) | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 result.sort_unstable(); |                 result.sort_unstable(); | ||||||
|                 result |                 result | ||||||
|             }, |             } | ||||||
|             Operation::Query(Query { prefix, kind }) => { |             Operation::Query(Query { prefix, kind }) => { | ||||||
|                 let mut result = Vec::new(); |                 let mut result = Vec::new(); | ||||||
|                 match kind { |                 match kind { | ||||||
| @@ -498,9 +532,9 @@ fn resolve_plane_sweep_candidates( | |||||||
|                                 .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); |                                 .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); | ||||||
|                             result.extend(iter); |                             result.extend(iter); | ||||||
|                         } else if let Some(positions) = words_positions.get(word) { |                         } else if let Some(positions) = words_positions.get(word) { | ||||||
|                                 result.extend(positions.iter().map(|p| (p, 0, p))); |                             result.extend(positions.iter().map(|p| (p, 0, p))); | ||||||
|                         } |                         } | ||||||
|                     }, |                     } | ||||||
|                     QueryKind::Tolerant { typo, word } => { |                     QueryKind::Tolerant { typo, word } => { | ||||||
|                         let iter = word_derivations(word, *prefix, *typo, &words_positions) |                         let iter = word_derivations(word, *prefix, *typo, &words_positions) | ||||||
|                             .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); |                             .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); | ||||||
| @@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates( | |||||||
|         is_prefix: bool, |         is_prefix: bool, | ||||||
|         max_typo: u8, |         max_typo: u8, | ||||||
|         words_positions: &'a HashMap<String, RoaringBitmap>, |         words_positions: &'a HashMap<String, RoaringBitmap>, | ||||||
|     ) -> impl Iterator<Item = &'a RoaringBitmap> |     ) -> impl Iterator<Item = &'a RoaringBitmap> { | ||||||
|     { |  | ||||||
|         let dfa = build_dfa(word, max_typo, is_prefix); |         let dfa = build_dfa(word, max_typo, is_prefix); | ||||||
|         words_positions.iter().filter_map(move |(document_word, positions)| { |         words_positions.iter().filter_map(move |(document_word, positions)| { | ||||||
|             use levenshtein_automata::Distance; |             use levenshtein_automata::Distance; | ||||||
| @@ -539,7 +572,7 @@ fn resolve_plane_sweep_candidates( | |||||||
|     for docid in allowed_candidates { |     for docid in allowed_candidates { | ||||||
|         let words_positions = ctx.docid_words_positions(docid)?; |         let words_positions = ctx.docid_words_positions(docid)?; | ||||||
|         resolve_operation_cache.clear(); |         resolve_operation_cache.clear(); | ||||||
|         let positions =  resolve_operation( |         let positions = resolve_operation( | ||||||
|             ctx, |             ctx, | ||||||
|             query_tree, |             query_tree, | ||||||
|             docid, |             docid, | ||||||
|   | |||||||
| @@ -1,20 +1,17 @@ | |||||||
| use std::{borrow::Cow, collections::HashMap, mem::take}; | use std::borrow::Cow; | ||||||
|  | use std::collections::HashMap; | ||||||
|  | use std::mem::take; | ||||||
|  |  | ||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{ | ||||||
|  |     query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, | ||||||
|  |     CriterionResult, | ||||||
|  | }; | ||||||
| use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; | use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; | ||||||
| use crate::search::{word_derivations, WordDerivationsCache}; | use crate::search::{word_derivations, WordDerivationsCache}; | ||||||
| use crate::Result; | use crate::Result; | ||||||
| use super::{ |  | ||||||
|     Candidates, |  | ||||||
|     Context, |  | ||||||
|     Criterion, |  | ||||||
|     CriterionParameters, |  | ||||||
|     CriterionResult, |  | ||||||
|     query_docids, |  | ||||||
|     resolve_query_tree, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| /// Maximum number of typo for a word of any length. | /// Maximum number of typo for a word of any length. | ||||||
| const MAX_TYPOS_PER_WORD: u8 = 2; | const MAX_TYPOS_PER_WORD: u8 = 2; | ||||||
| @@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         loop { |         loop { | ||||||
|             debug!("Typo at iteration {} (max typos {:?}) ({:?})", |             debug!( | ||||||
|  |                 "Typo at iteration {} (max typos {:?}) ({:?})", | ||||||
|                 self.typos, |                 self.typos, | ||||||
|                 self.state.as_ref().map(|(mt, _, _)| mt), |                 self.state.as_ref().map(|(mt, _, _)| mt), | ||||||
|                 self.state.as_ref().map(|(_, _, cd)| cd), |                 self.state.as_ref().map(|(_, _, cd)| cd), | ||||||
| @@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> { | |||||||
|             match self.state.as_mut() { |             match self.state.as_mut() { | ||||||
|                 Some((max_typos, _, _)) if self.typos > *max_typos => { |                 Some((max_typos, _, _)) if self.typos > *max_typos => { | ||||||
|                     self.state = None; // reset state |                     self.state = None; // reset state | ||||||
|                 }, |                 } | ||||||
|                 Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { |                 Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { | ||||||
|                     self.state = None; // reset state |                     self.state = None; // reset state | ||||||
|                 }, |                 } | ||||||
|                 Some((_, query_tree, candidates_authorization)) => { |                 Some((_, query_tree, candidates_authorization)) => { | ||||||
|                     let fst = self.ctx.words_fst(); |                     let fst = self.ctx.words_fst(); | ||||||
|                     let new_query_tree = match self.typos { |                     let new_query_tree = match self.typos { | ||||||
|                         typos if typos < MAX_TYPOS_PER_WORD => { |                         typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( | ||||||
|                             alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? |                             &fst, | ||||||
|                         }, |                             query_tree.clone(), | ||||||
|  |                             self.typos, | ||||||
|  |                             params.wdcache, | ||||||
|  |                         )?, | ||||||
|                         MAX_TYPOS_PER_WORD => { |                         MAX_TYPOS_PER_WORD => { | ||||||
|                             // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, |                             // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, | ||||||
|                             // we keep the altered query tree |                             // we keep the altered query tree | ||||||
|                             *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; |                             *query_tree = alterate_query_tree( | ||||||
|  |                                 &fst, | ||||||
|  |                                 query_tree.clone(), | ||||||
|  |                                 self.typos, | ||||||
|  |                                 params.wdcache, | ||||||
|  |                             )?; | ||||||
|                             // we compute the allowed candidates |                             // we compute the allowed candidates | ||||||
|                             let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; |                             let query_tree_allowed_candidates = | ||||||
|  |                                 resolve_query_tree(self.ctx, query_tree, params.wdcache)?; | ||||||
|                             // we assign the allowed candidates to the candidates authorization. |                             // we assign the allowed candidates to the candidates authorization. | ||||||
|                             *candidates_authorization = match take(candidates_authorization) { |                             *candidates_authorization = match take(candidates_authorization) { | ||||||
|                                 Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), |                                 Allowed(allowed_candidates) => { | ||||||
|                                 Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), |                                     Allowed(query_tree_allowed_candidates & allowed_candidates) | ||||||
|  |                                 } | ||||||
|  |                                 Forbidden(forbidden_candidates) => { | ||||||
|  |                                     Allowed(query_tree_allowed_candidates - forbidden_candidates) | ||||||
|  |                                 } | ||||||
|                             }; |                             }; | ||||||
|                             query_tree.clone() |                             query_tree.clone() | ||||||
|                         }, |                         } | ||||||
|                         _otherwise => query_tree.clone(), |                         _otherwise => query_tree.clone(), | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
| @@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> { | |||||||
|                         Allowed(allowed_candidates) => { |                         Allowed(allowed_candidates) => { | ||||||
|                             candidates &= &*allowed_candidates; |                             candidates &= &*allowed_candidates; | ||||||
|                             *allowed_candidates -= &candidates; |                             *allowed_candidates -= &candidates; | ||||||
|                         }, |                         } | ||||||
|                         Forbidden(forbidden_candidates) => { |                         Forbidden(forbidden_candidates) => { | ||||||
|                             candidates -= &*forbidden_candidates; |                             candidates -= &*forbidden_candidates; | ||||||
|                             *forbidden_candidates |= &candidates; |                             *forbidden_candidates |= &candidates; | ||||||
|                         }, |                         } | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|                     let bucket_candidates = match self.bucket_candidates.as_mut() { |                     let bucket_candidates = match self.bucket_candidates.as_mut() { | ||||||
| @@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> { | |||||||
|                         filtered_candidates: None, |                         filtered_candidates: None, | ||||||
|                         bucket_candidates: Some(bucket_candidates), |                         bucket_candidates: Some(bucket_candidates), | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree: Some(query_tree), | ||||||
|                             self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         self.bucket_candidates = | ||||||
|  |                             match (self.bucket_candidates.take(), bucket_candidates) { | ||||||
|                                 (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), |                                 (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), | ||||||
|                                 (self_bc, parent_bc) => self_bc.or(parent_bc), |                                 (self_bc, parent_bc) => self_bc.or(parent_bc), | ||||||
|                             }; |                             }; | ||||||
|  |  | ||||||
|                             let candidates = match candidates.or(filtered_candidates) { |                         let candidates = match candidates.or(filtered_candidates) { | ||||||
|                                 Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), |                             Some(candidates) => { | ||||||
|                                 None => Candidates::Forbidden(params.excluded_candidates.clone()), |                                 Candidates::Allowed(candidates - params.excluded_candidates) | ||||||
|                             }; |                             } | ||||||
|  |                             None => Candidates::Forbidden(params.excluded_candidates.clone()), | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                             let maximum_typos = maximum_typo(&query_tree) as u8; |                         let maximum_typos = maximum_typo(&query_tree) as u8; | ||||||
|                             self.state = Some((maximum_typos, query_tree, candidates)); |                         self.state = Some((maximum_typos, query_tree, candidates)); | ||||||
|                             self.typos = 0; |                         self.typos = 0; | ||||||
|  |  | ||||||
|                         }, |  | ||||||
|                         Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { |  | ||||||
|                             return Ok(Some(CriterionResult { |  | ||||||
|                                 query_tree: None, |  | ||||||
|                                 candidates, |  | ||||||
|                                 filtered_candidates, |  | ||||||
|                                 bucket_candidates, |  | ||||||
|                             })); |  | ||||||
|                         }, |  | ||||||
|                         None => return Ok(None), |  | ||||||
|                     } |                     } | ||||||
|  |                     Some(CriterionResult { | ||||||
|  |                         query_tree: None, | ||||||
|  |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         return Ok(Some(CriterionResult { | ||||||
|  |                             query_tree: None, | ||||||
|  |                             candidates, | ||||||
|  |                             filtered_candidates, | ||||||
|  |                             bucket_candidates, | ||||||
|  |                         })); | ||||||
|  |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -164,21 +185,19 @@ fn alterate_query_tree( | |||||||
|     mut query_tree: Operation, |     mut query_tree: Operation, | ||||||
|     number_typos: u8, |     number_typos: u8, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<Operation> | ) -> Result<Operation> { | ||||||
| { |  | ||||||
|     fn recurse( |     fn recurse( | ||||||
|         words_fst: &fst::Set<Cow<[u8]>>, |         words_fst: &fst::Set<Cow<[u8]>>, | ||||||
|         operation: &mut Operation, |         operation: &mut Operation, | ||||||
|         number_typos: u8, |         number_typos: u8, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |         use Operation::{And, Or, Phrase}; | ||||||
|         use Operation::{And, Phrase, Or}; |  | ||||||
|  |  | ||||||
|         match operation { |         match operation { | ||||||
|             And(ops) | Or(_, ops) => { |             And(ops) | Or(_, ops) => { | ||||||
|                 ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) |                 ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) | ||||||
|             }, |             } | ||||||
|             // Because Phrases don't allow typos, no alteration can be done. |             // Because Phrases don't allow typos, no alteration can be done. | ||||||
|             Phrase(_words) => return Ok(()), |             Phrase(_words) => return Ok(()), | ||||||
|             Operation::Query(q) => { |             Operation::Query(q) => { | ||||||
| @@ -193,19 +212,25 @@ fn alterate_query_tree( | |||||||
|                     } else { |                     } else { | ||||||
|                         let typo = *typo.min(&number_typos); |                         let typo = *typo.min(&number_typos); | ||||||
|                         let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; |                         let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; | ||||||
|                         let queries = words.iter().map(|(word, typo)| { |                         let queries = words | ||||||
|                             Operation::Query(Query { |                             .iter() | ||||||
|                                 prefix: false, |                             .map(|(word, typo)| { | ||||||
|                                 kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, |                                 Operation::Query(Query { | ||||||
|  |                                     prefix: false, | ||||||
|  |                                     kind: QueryKind::Exact { | ||||||
|  |                                         original_typo: *typo, | ||||||
|  |                                         word: word.to_string(), | ||||||
|  |                                     }, | ||||||
|  |                                 }) | ||||||
|                             }) |                             }) | ||||||
|                         }).collect(); |                             .collect(); | ||||||
|  |  | ||||||
|                         *operation = Operation::or(false, queries); |                         *operation = Operation::or(false, queries); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 Ok(()) |                 Ok(()) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -219,22 +244,18 @@ fn resolve_candidates<'t>( | |||||||
|     number_typos: u8, |     number_typos: u8, | ||||||
|     cache: &mut HashMap<(Operation, u8), RoaringBitmap>, |     cache: &mut HashMap<(Operation, u8), RoaringBitmap>, | ||||||
|     wdcache: &mut WordDerivationsCache, |     wdcache: &mut WordDerivationsCache, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     fn resolve_operation<'t>( |     fn resolve_operation<'t>( | ||||||
|         ctx: &'t dyn Context, |         ctx: &'t dyn Context, | ||||||
|         query_tree: &Operation, |         query_tree: &Operation, | ||||||
|         number_typos: u8, |         number_typos: u8, | ||||||
|         cache: &mut HashMap<(Operation, u8), RoaringBitmap>, |         cache: &mut HashMap<(Operation, u8), RoaringBitmap>, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<RoaringBitmap> |     ) -> Result<RoaringBitmap> { | ||||||
|     { |         use Operation::{And, Or, Phrase, Query}; | ||||||
|         use Operation::{And, Phrase, Or, Query}; |  | ||||||
|  |  | ||||||
|         match query_tree { |         match query_tree { | ||||||
|             And(ops) => { |             And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), | ||||||
|                 mdfs(ctx, ops, number_typos, cache, wdcache) |  | ||||||
|             }, |  | ||||||
|             Phrase(words) => { |             Phrase(words) => { | ||||||
|                 let mut candidates = RoaringBitmap::new(); |                 let mut candidates = RoaringBitmap::new(); | ||||||
|                 let mut first_loop = true; |                 let mut first_loop = true; | ||||||
| @@ -250,12 +271,12 @@ fn resolve_candidates<'t>( | |||||||
|                             } else { |                             } else { | ||||||
|                                 candidates &= pair_docids; |                                 candidates &= pair_docids; | ||||||
|                             } |                             } | ||||||
|                         }, |                         } | ||||||
|                         None => return Ok(RoaringBitmap::new()) |                         None => return Ok(RoaringBitmap::new()), | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             Or(_, ops) => { |             Or(_, ops) => { | ||||||
|                 let mut candidates = RoaringBitmap::new(); |                 let mut candidates = RoaringBitmap::new(); | ||||||
|                 for op in ops { |                 for op in ops { | ||||||
| @@ -263,12 +284,14 @@ fn resolve_candidates<'t>( | |||||||
|                     candidates.union_with(&docids); |                     candidates.union_with(&docids); | ||||||
|                 } |                 } | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             Query(q) => if q.kind.typo() == number_typos { |             Query(q) => { | ||||||
|                 Ok(query_docids(ctx, q, wdcache)?) |                 if q.kind.typo() == number_typos { | ||||||
|             } else { |                     Ok(query_docids(ctx, q, wdcache)?) | ||||||
|                 Ok(RoaringBitmap::new()) |                 } else { | ||||||
|             }, |                     Ok(RoaringBitmap::new()) | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -278,8 +301,7 @@ fn resolve_candidates<'t>( | |||||||
|         mana: u8, |         mana: u8, | ||||||
|         cache: &mut HashMap<(Operation, u8), RoaringBitmap>, |         cache: &mut HashMap<(Operation, u8), RoaringBitmap>, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> Result<RoaringBitmap> |     ) -> Result<RoaringBitmap> { | ||||||
|     { |  | ||||||
|         match branches.split_first() { |         match branches.split_first() { | ||||||
|             Some((head, [])) => { |             Some((head, [])) => { | ||||||
|                 let cache_key = (head.clone(), mana); |                 let cache_key = (head.clone(), mana); | ||||||
| @@ -290,7 +312,7 @@ fn resolve_candidates<'t>( | |||||||
|                     cache.insert(cache_key, candidates.clone()); |                     cache.insert(cache_key, candidates.clone()); | ||||||
|                     Ok(candidates) |                     Ok(candidates) | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             Some((head, tail)) => { |             Some((head, tail)) => { | ||||||
|                 let mut candidates = RoaringBitmap::new(); |                 let mut candidates = RoaringBitmap::new(); | ||||||
|  |  | ||||||
| @@ -313,7 +335,7 @@ fn resolve_candidates<'t>( | |||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 Ok(candidates) |                 Ok(candidates) | ||||||
|             }, |             } | ||||||
|             None => Ok(RoaringBitmap::new()), |             None => Ok(RoaringBitmap::new()), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -323,9 +345,9 @@ fn resolve_candidates<'t>( | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod test { | mod test { | ||||||
|     use super::*; |  | ||||||
|     use super::super::initial::Initial; |     use super::super::initial::Initial; | ||||||
|     use super::super::test::TestContext; |     use super::super::test::TestContext; | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn initial_placeholder_no_facets() { |     fn initial_placeholder_no_facets() { | ||||||
| @@ -348,13 +370,23 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn initial_query_tree_no_facets() { |     fn initial_query_tree_no_facets() { | ||||||
|         let context = TestContext::default(); |         let context = TestContext::default(); | ||||||
|         let query_tree = Operation::Or(false, vec![ |         let query_tree = Operation::Or( | ||||||
|             Operation::And(vec![ |             false, | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |             vec![Operation::And(vec![ | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                 Operation::Query(Query { | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), |                     prefix: false, | ||||||
|             ]) |                     kind: QueryKind::exact("split".to_string()), | ||||||
|         ]); |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: false, | ||||||
|  |                     kind: QueryKind::exact("this".to_string()), | ||||||
|  |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: false, | ||||||
|  |                     kind: QueryKind::tolerant(1, "world".to_string()), | ||||||
|  |                 }), | ||||||
|  |             ])], | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         let facet_candidates = None; |         let facet_candidates = None; | ||||||
|  |  | ||||||
| @@ -369,13 +401,23 @@ mod test { | |||||||
|             & context.word_docids("this").unwrap().unwrap() |             & context.word_docids("this").unwrap().unwrap() | ||||||
|             & context.word_docids("world").unwrap().unwrap(); |             & context.word_docids("world").unwrap().unwrap(); | ||||||
|         let expected_1 = CriterionResult { |         let expected_1 = CriterionResult { | ||||||
|             query_tree: Some(Operation::Or(false, vec![ |             query_tree: Some(Operation::Or( | ||||||
|                 Operation::And(vec![ |                 false, | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |                 vec![Operation::And(vec![ | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                     Operation::Query(Query { | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), |                         prefix: false, | ||||||
|                 ]), |                         kind: QueryKind::exact("split".to_string()), | ||||||
|             ])), |                     }), | ||||||
|  |                     Operation::Query(Query { | ||||||
|  |                         prefix: false, | ||||||
|  |                         kind: QueryKind::exact("this".to_string()), | ||||||
|  |                     }), | ||||||
|  |                     Operation::Query(Query { | ||||||
|  |                         prefix: false, | ||||||
|  |                         kind: QueryKind::exact("world".to_string()), | ||||||
|  |                     }), | ||||||
|  |                 ])], | ||||||
|  |             )), | ||||||
|             candidates: Some(candidates_1.clone()), |             candidates: Some(candidates_1.clone()), | ||||||
|             bucket_candidates: Some(candidates_1), |             bucket_candidates: Some(candidates_1), | ||||||
|             filtered_candidates: None, |             filtered_candidates: None, | ||||||
| @@ -383,22 +425,37 @@ mod test { | |||||||
|  |  | ||||||
|         assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); |         assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); | ||||||
|  |  | ||||||
|         let candidates_2 = ( |         let candidates_2 = (context.word_docids("split").unwrap().unwrap() | ||||||
|                 context.word_docids("split").unwrap().unwrap() |             & context.word_docids("this").unwrap().unwrap() | ||||||
|                 & context.word_docids("this").unwrap().unwrap() |             & context.word_docids("word").unwrap().unwrap()) | ||||||
|                 & context.word_docids("word").unwrap().unwrap() |             - context.word_docids("world").unwrap().unwrap(); | ||||||
|             ) - context.word_docids("world").unwrap().unwrap(); |  | ||||||
|         let expected_2 = CriterionResult { |         let expected_2 = CriterionResult { | ||||||
|             query_tree: Some(Operation::Or(false, vec![ |             query_tree: Some(Operation::Or( | ||||||
|                 Operation::And(vec![ |                 false, | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |                 vec![Operation::And(vec![ | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                     Operation::Query(Query { | ||||||
|                     Operation::Or(false, vec![ |                         prefix: false, | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), |                         kind: QueryKind::exact("split".to_string()), | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), |                     }), | ||||||
|                     ]), |                     Operation::Query(Query { | ||||||
|                 ]), |                         prefix: false, | ||||||
|             ])), |                         kind: QueryKind::exact("this".to_string()), | ||||||
|  |                     }), | ||||||
|  |                     Operation::Or( | ||||||
|  |                         false, | ||||||
|  |                         vec![ | ||||||
|  |                             Operation::Query(Query { | ||||||
|  |                                 prefix: false, | ||||||
|  |                                 kind: QueryKind::exact_with_typo(1, "word".to_string()), | ||||||
|  |                             }), | ||||||
|  |                             Operation::Query(Query { | ||||||
|  |                                 prefix: false, | ||||||
|  |                                 kind: QueryKind::exact("world".to_string()), | ||||||
|  |                             }), | ||||||
|  |                         ], | ||||||
|  |                     ), | ||||||
|  |                 ])], | ||||||
|  |             )), | ||||||
|             candidates: Some(candidates_2.clone()), |             candidates: Some(candidates_2.clone()), | ||||||
|             bucket_candidates: Some(candidates_2), |             bucket_candidates: Some(candidates_2), | ||||||
|             filtered_candidates: None, |             filtered_candidates: None, | ||||||
| @@ -437,17 +494,26 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn initial_query_tree_with_facets() { |     fn initial_query_tree_with_facets() { | ||||||
|         let context = TestContext::default(); |         let context = TestContext::default(); | ||||||
|         let query_tree = Operation::Or(false, vec![ |         let query_tree = Operation::Or( | ||||||
|             Operation::And(vec![ |             false, | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |             vec![Operation::And(vec![ | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                 Operation::Query(Query { | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), |                     prefix: false, | ||||||
|             ]) |                     kind: QueryKind::exact("split".to_string()), | ||||||
|         ]); |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: false, | ||||||
|  |                     kind: QueryKind::exact("this".to_string()), | ||||||
|  |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: false, | ||||||
|  |                     kind: QueryKind::tolerant(1, "world".to_string()), | ||||||
|  |                 }), | ||||||
|  |             ])], | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         let facet_candidates = context.word_docids("earth").unwrap().unwrap(); |         let facet_candidates = context.word_docids("earth").unwrap().unwrap(); | ||||||
|  |  | ||||||
|  |  | ||||||
|         let mut criterion_parameters = CriterionParameters { |         let mut criterion_parameters = CriterionParameters { | ||||||
|             wdcache: &mut WordDerivationsCache::new(), |             wdcache: &mut WordDerivationsCache::new(), | ||||||
|             excluded_candidates: &RoaringBitmap::new(), |             excluded_candidates: &RoaringBitmap::new(), | ||||||
| @@ -459,13 +525,23 @@ mod test { | |||||||
|             & context.word_docids("this").unwrap().unwrap() |             & context.word_docids("this").unwrap().unwrap() | ||||||
|             & context.word_docids("world").unwrap().unwrap(); |             & context.word_docids("world").unwrap().unwrap(); | ||||||
|         let expected_1 = CriterionResult { |         let expected_1 = CriterionResult { | ||||||
|             query_tree: Some(Operation::Or(false, vec![ |             query_tree: Some(Operation::Or( | ||||||
|                 Operation::And(vec![ |                 false, | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |                 vec![Operation::And(vec![ | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                     Operation::Query(Query { | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), |                         prefix: false, | ||||||
|                 ]), |                         kind: QueryKind::exact("split".to_string()), | ||||||
|             ])), |                     }), | ||||||
|  |                     Operation::Query(Query { | ||||||
|  |                         prefix: false, | ||||||
|  |                         kind: QueryKind::exact("this".to_string()), | ||||||
|  |                     }), | ||||||
|  |                     Operation::Query(Query { | ||||||
|  |                         prefix: false, | ||||||
|  |                         kind: QueryKind::exact("world".to_string()), | ||||||
|  |                     }), | ||||||
|  |                 ])], | ||||||
|  |             )), | ||||||
|             candidates: Some(&candidates_1 & &facet_candidates), |             candidates: Some(&candidates_1 & &facet_candidates), | ||||||
|             bucket_candidates: Some(&candidates_1 & &facet_candidates), |             bucket_candidates: Some(&candidates_1 & &facet_candidates), | ||||||
|             filtered_candidates: None, |             filtered_candidates: None, | ||||||
| @@ -473,22 +549,37 @@ mod test { | |||||||
|  |  | ||||||
|         assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); |         assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); | ||||||
|  |  | ||||||
|         let candidates_2 = ( |         let candidates_2 = (context.word_docids("split").unwrap().unwrap() | ||||||
|                 context.word_docids("split").unwrap().unwrap() |             & context.word_docids("this").unwrap().unwrap() | ||||||
|                 & context.word_docids("this").unwrap().unwrap() |             & context.word_docids("word").unwrap().unwrap()) | ||||||
|                 & context.word_docids("word").unwrap().unwrap() |             - context.word_docids("world").unwrap().unwrap(); | ||||||
|             ) - context.word_docids("world").unwrap().unwrap(); |  | ||||||
|         let expected_2 = CriterionResult { |         let expected_2 = CriterionResult { | ||||||
|             query_tree: Some(Operation::Or(false, vec![ |             query_tree: Some(Operation::Or( | ||||||
|                 Operation::And(vec![ |                 false, | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), |                 vec![Operation::And(vec![ | ||||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                     Operation::Query(Query { | ||||||
|                     Operation::Or(false, vec![ |                         prefix: false, | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), |                         kind: QueryKind::exact("split".to_string()), | ||||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), |                     }), | ||||||
|                     ]), |                     Operation::Query(Query { | ||||||
|                 ]), |                         prefix: false, | ||||||
|             ])), |                         kind: QueryKind::exact("this".to_string()), | ||||||
|  |                     }), | ||||||
|  |                     Operation::Or( | ||||||
|  |                         false, | ||||||
|  |                         vec![ | ||||||
|  |                             Operation::Query(Query { | ||||||
|  |                                 prefix: false, | ||||||
|  |                                 kind: QueryKind::exact_with_typo(1, "word".to_string()), | ||||||
|  |                             }), | ||||||
|  |                             Operation::Query(Query { | ||||||
|  |                                 prefix: false, | ||||||
|  |                                 kind: QueryKind::exact("world".to_string()), | ||||||
|  |                             }), | ||||||
|  |                         ], | ||||||
|  |                     ), | ||||||
|  |                 ])], | ||||||
|  |             )), | ||||||
|             candidates: Some(&candidates_2 & &facet_candidates), |             candidates: Some(&candidates_2 & &facet_candidates), | ||||||
|             bucket_candidates: Some(&candidates_2 & &facet_candidates), |             bucket_candidates: Some(&candidates_2 & &facet_candidates), | ||||||
|             filtered_candidates: None, |             filtered_candidates: None, | ||||||
|   | |||||||
| @@ -3,9 +3,9 @@ use std::mem::take; | |||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; | ||||||
| use crate::search::query_tree::Operation; | use crate::search::query_tree::Operation; | ||||||
| use crate::Result; | use crate::Result; | ||||||
| use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; |  | ||||||
|  |  | ||||||
| pub struct Words<'t> { | pub struct Words<'t> { | ||||||
|     ctx: &'t dyn Context<'t>, |     ctx: &'t dyn Context<'t>, | ||||||
| @@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> { | |||||||
|                 Some(query_tree) => { |                 Some(query_tree) => { | ||||||
|                     let candidates = match self.candidates.as_mut() { |                     let candidates = match self.candidates.as_mut() { | ||||||
|                         Some(allowed_candidates) => { |                         Some(allowed_candidates) => { | ||||||
|                             let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; |                             let mut candidates = | ||||||
|  |                                 resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; | ||||||
|                             candidates &= &*allowed_candidates; |                             candidates &= &*allowed_candidates; | ||||||
|                             *allowed_candidates -= &candidates; |                             *allowed_candidates -= &candidates; | ||||||
|                             Some(candidates) |                             Some(candidates) | ||||||
|                         }, |                         } | ||||||
|                         None => None, |                         None => None, | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
| @@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> { | |||||||
|                         filtered_candidates: self.filtered_candidates.clone(), |                         filtered_candidates: self.filtered_candidates.clone(), | ||||||
|                         bucket_candidates, |                         bucket_candidates, | ||||||
|                     })); |                     })); | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => match self.parent.next(params)? { | ||||||
|                     match self.parent.next(params)? { |                     Some(CriterionResult { | ||||||
|                         Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { |                         query_tree: Some(query_tree), | ||||||
|                             self.query_trees = explode_query_tree(query_tree); |                         candidates, | ||||||
|                             self.candidates = candidates; |                         filtered_candidates, | ||||||
|                             self.filtered_candidates = filtered_candidates; |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         self.query_trees = explode_query_tree(query_tree); | ||||||
|  |                         self.candidates = candidates; | ||||||
|  |                         self.filtered_candidates = filtered_candidates; | ||||||
|  |  | ||||||
|                             self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { |                         self.bucket_candidates = | ||||||
|  |                             match (self.bucket_candidates.take(), bucket_candidates) { | ||||||
|                                 (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), |                                 (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), | ||||||
|                                 (self_bc, parent_bc) => self_bc.or(parent_bc), |                                 (self_bc, parent_bc) => self_bc.or(parent_bc), | ||||||
|                             }; |                             }; | ||||||
|                         }, |  | ||||||
|                         Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { |  | ||||||
|                             return Ok(Some(CriterionResult { |  | ||||||
|                                 query_tree: None, |  | ||||||
|                                 candidates, |  | ||||||
|                                 filtered_candidates, |  | ||||||
|                                 bucket_candidates, |  | ||||||
|                             })); |  | ||||||
|                         }, |  | ||||||
|                         None => return Ok(None), |  | ||||||
|                     } |                     } | ||||||
|  |                     Some(CriterionResult { | ||||||
|  |                         query_tree: None, | ||||||
|  |                         candidates, | ||||||
|  |                         filtered_candidates, | ||||||
|  |                         bucket_candidates, | ||||||
|  |                     }) => { | ||||||
|  |                         return Ok(Some(CriterionResult { | ||||||
|  |                             query_tree: None, | ||||||
|  |                             candidates, | ||||||
|  |                             filtered_candidates, | ||||||
|  |                             bucket_candidates, | ||||||
|  |                         })); | ||||||
|  |                     } | ||||||
|  |                     None => return Ok(None), | ||||||
|                 }, |                 }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -3,11 +3,11 @@ use std::mem::size_of; | |||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{Distinct, DocIter}; | ||||||
| use crate::error::InternalError; | use crate::error::InternalError; | ||||||
| use crate::heed_codec::facet::*; | use crate::heed_codec::facet::*; | ||||||
| use crate::index::db_name; | use crate::index::db_name; | ||||||
| use crate::{DocumentId, FieldId, Index, Result}; | use crate::{DocumentId, FieldId, Index, Result}; | ||||||
| use super::{Distinct, DocIter}; |  | ||||||
|  |  | ||||||
| const FID_SIZE: usize = size_of::<FieldId>(); | const FID_SIZE: usize = size_of::<FieldId>(); | ||||||
| const DOCID_SIZE: usize = size_of::<DocumentId>(); | const DOCID_SIZE: usize = size_of::<DocumentId>(); | ||||||
| @@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> { | |||||||
|  |  | ||||||
| impl<'a> FacetDistinct<'a> { | impl<'a> FacetDistinct<'a> { | ||||||
|     pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { |     pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { | ||||||
|         Self { |         Self { distinct, index, txn } | ||||||
|             distinct, |  | ||||||
|             index, |  | ||||||
|             txn, |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> { | |||||||
|  |  | ||||||
| impl<'a> FacetDistinctIter<'a> { | impl<'a> FacetDistinctIter<'a> { | ||||||
|     fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> { |     fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         self.index |         self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) | ||||||
|             .facet_id_string_docids |  | ||||||
|             .get(self.txn, &(self.distinct, key)) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> { |     fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         // get facet docids on level 0 |         // get facet docids on level 0 | ||||||
|         self.index |         self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) | ||||||
|             .facet_id_f64_docids |  | ||||||
|             .get(self.txn, &(self.distinct, 0, key, key)) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn distinct_string(&mut self, id: DocumentId) -> Result<()> { |     fn distinct_string(&mut self, id: DocumentId) -> Result<()> { | ||||||
| @@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> { | |||||||
|  |  | ||||||
|         for item in iter { |         for item in iter { | ||||||
|             let ((_, _, value), _) = item?; |             let ((_, _, value), _) = item?; | ||||||
|             let facet_docids = self |             let facet_docids = | ||||||
|                 .facet_string_docids(value)? |                 self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { | ||||||
|                 .ok_or(InternalError::DatabaseMissingEntry { |  | ||||||
|                     db_name: db_name::FACET_ID_STRING_DOCIDS, |                     db_name: db_name::FACET_ID_STRING_DOCIDS, | ||||||
|                     key: None, |                     key: None, | ||||||
|                 })?; |                 })?; | ||||||
| @@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> { | |||||||
|  |  | ||||||
|         for item in iter { |         for item in iter { | ||||||
|             let ((_, _, value), _) = item?; |             let ((_, _, value), _) = item?; | ||||||
|             let facet_docids = self |             let facet_docids = | ||||||
|                 .facet_number_docids(value)? |                 self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { | ||||||
|                 .ok_or(InternalError::DatabaseMissingEntry { |  | ||||||
|                     db_name: db_name::FACET_ID_F64_DOCIDS, |                     db_name: db_name::FACET_ID_F64_DOCIDS, | ||||||
|                     key: None, |                     key: None, | ||||||
|                 })?; |                 })?; | ||||||
|   | |||||||
| @@ -1,11 +1,11 @@ | |||||||
| mod facet_distinct; | mod facet_distinct; | ||||||
| mod noop_distinct; | mod noop_distinct; | ||||||
|  |  | ||||||
|  | pub use facet_distinct::FacetDistinct; | ||||||
|  | pub use noop_distinct::NoopDistinct; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::{DocumentId, Result}; | use crate::{DocumentId, Result}; | ||||||
| pub use facet_distinct::FacetDistinct; |  | ||||||
| pub use noop_distinct::NoopDistinct; |  | ||||||
|  |  | ||||||
| /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. | /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. | ||||||
| /// It provides a way to get back the ownership to the excluded set. | /// It provides a way to get back the ownership to the excluded set. | ||||||
| @@ -29,13 +29,15 @@ mod test { | |||||||
|     use std::collections::HashSet; |     use std::collections::HashSet; | ||||||
|  |  | ||||||
|     use once_cell::sync::Lazy; |     use once_cell::sync::Lazy; | ||||||
|     use rand::{seq::SliceRandom, Rng}; |     use rand::seq::SliceRandom; | ||||||
|  |     use rand::Rng; | ||||||
|     use roaring::RoaringBitmap; |     use roaring::RoaringBitmap; | ||||||
|     use serde_json::{json, Value}; |     use serde_json::{json, Value}; | ||||||
|  |  | ||||||
|     use crate::index::{Index, tests::TempIndex}; |     use crate::index::tests::TempIndex; | ||||||
|  |     use crate::index::Index; | ||||||
|     use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; |     use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; | ||||||
|     use crate::{BEU32, FieldId, DocumentId}; |     use crate::{DocumentId, FieldId, BEU32}; | ||||||
|  |  | ||||||
|     static JSON: Lazy<Value> = Lazy::new(generate_json); |     static JSON: Lazy<Value> = Lazy::new(generate_json); | ||||||
|  |  | ||||||
| @@ -89,9 +91,7 @@ mod test { | |||||||
|         addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); |         addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); | ||||||
|         addition.update_format(UpdateFormat::Json); |         addition.update_format(UpdateFormat::Json); | ||||||
|  |  | ||||||
|         addition |         addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); | ||||||
|             .execute(JSON.to_string().as_bytes(), |_, _| ()) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let fields_map = index.fields_ids_map(&txn).unwrap(); |         let fields_map = index.fields_ids_map(&txn).unwrap(); | ||||||
|         let fid = fields_map.id(&distinct).unwrap(); |         let fid = fields_map.id(&distinct).unwrap(); | ||||||
| @@ -103,13 +103,12 @@ mod test { | |||||||
|         (index, fid, map) |         (index, fid, map) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|     /// Checks that all the candidates are distinct, and returns the candidates number. |     /// Checks that all the candidates are distinct, and returns the candidates number. | ||||||
|     pub(crate) fn validate_distinct_candidates( |     pub(crate) fn validate_distinct_candidates( | ||||||
|         candidates: impl Iterator<Item = crate::Result<DocumentId>>, |         candidates: impl Iterator<Item = crate::Result<DocumentId>>, | ||||||
|         distinct: FieldId, |         distinct: FieldId, | ||||||
|         index: &Index, |         index: &Index, | ||||||
|         ) -> usize { |     ) -> usize { | ||||||
|         fn test(seen: &mut HashSet<String>, value: &Value) { |         fn test(seen: &mut HashSet<String>, value: &Value) { | ||||||
|             match value { |             match value { | ||||||
|                 Value::Null | Value::Object(_) | Value::Bool(_) => (), |                 Value::Null | Value::Object(_) | Value::Bool(_) => (), | ||||||
| @@ -117,7 +116,7 @@ mod test { | |||||||
|                     let s = value.to_string(); |                     let s = value.to_string(); | ||||||
|                     assert!(seen.insert(s)); |                     assert!(seen.insert(s)); | ||||||
|                 } |                 } | ||||||
|                 Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} |                 Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)), | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,7 +1,8 @@ | |||||||
| use roaring::{RoaringBitmap, bitmap::IntoIter}; | use roaring::bitmap::IntoIter; | ||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{Distinct, DocIter}; | ||||||
| use crate::{DocumentId, Result}; | use crate::{DocumentId, Result}; | ||||||
| use super::{DocIter, Distinct}; |  | ||||||
|  |  | ||||||
| /// A distinct implementer that does not perform any distinct, | /// A distinct implementer that does not perform any distinct, | ||||||
| /// and simply returns an iterator to the candidates. | /// and simply returns an iterator to the candidates. | ||||||
| @@ -30,10 +31,7 @@ impl Distinct for NoopDistinct { | |||||||
|     type Iter = NoopDistinctIter; |     type Iter = NoopDistinctIter; | ||||||
|  |  | ||||||
|     fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { |     fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { | ||||||
|         NoopDistinctIter { |         NoopDistinctIter { candidates: candidates.into_iter(), excluded } | ||||||
|             candidates: candidates.into_iter(), |  | ||||||
|             excluded, |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,16 +1,16 @@ | |||||||
| use std::collections::{HashSet, BTreeMap}; | use std::collections::{BTreeMap, HashSet}; | ||||||
| use std::ops::Bound::Unbounded; | use std::ops::Bound::Unbounded; | ||||||
| use std::{cmp, fmt}; | use std::{cmp, fmt}; | ||||||
|  |  | ||||||
| use heed::{Database, BytesDecode}; |  | ||||||
| use heed::types::{ByteSlice, Unit}; | use heed::types::{ByteSlice, Unit}; | ||||||
|  | use heed::{BytesDecode, Database}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::error::FieldIdMapMissingEntry; | use crate::error::FieldIdMapMissingEntry; | ||||||
| use crate::facet::FacetType; | use crate::facet::FacetType; | ||||||
| use crate::heed_codec::facet::FacetValueStringCodec; | use crate::heed_codec::facet::FacetValueStringCodec; | ||||||
| use crate::search::facet::{FacetIter, FacetRange}; | use crate::search::facet::{FacetIter, FacetRange}; | ||||||
| use crate::{Index, FieldId, DocumentId, Result}; | use crate::{DocumentId, FieldId, Index, Result}; | ||||||
|  |  | ||||||
| /// The default number of values by facets that will | /// The default number of values by facets that will | ||||||
| /// be fetched from the key-value store. | /// be fetched from the key-value store. | ||||||
| @@ -43,7 +43,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self { |     pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self { | ||||||
|         self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); |         self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); | ||||||
|         self |         self | ||||||
|     } |     } | ||||||
| @@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         facet_type: FacetType, |         facet_type: FacetType, | ||||||
|         candidates: &RoaringBitmap, |         candidates: &RoaringBitmap, | ||||||
|         distribution: &mut BTreeMap<String, u64>, |         distribution: &mut BTreeMap<String, u64>, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |  | ||||||
|         fn fetch_facet_values<'t, KC, K: 't>( |         fn fetch_facet_values<'t, KC, K: 't>( | ||||||
|             rtxn: &'t heed::RoTxn, |             rtxn: &'t heed::RoTxn, | ||||||
|             db: Database<KC, Unit>, |             db: Database<KC, Unit>, | ||||||
| @@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             FacetType::Number => { |             FacetType::Number => { | ||||||
|                 let db = self.index.field_id_docid_facet_f64s; |                 let db = self.index.field_id_docid_facet_f64s; | ||||||
|                 fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) |                 fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) | ||||||
|             }, |             } | ||||||
|             FacetType::String => { |             FacetType::String => { | ||||||
|                 let db = self.index.field_id_docid_facet_strings; |                 let db = self.index.field_id_docid_facet_strings; | ||||||
|                 fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) |                 fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) | ||||||
| @@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         candidates: &RoaringBitmap, |         candidates: &RoaringBitmap, | ||||||
|         distribution: &mut BTreeMap<String, u64>, |         distribution: &mut BTreeMap<String, u64>, | ||||||
|     ) -> heed::Result<()> |     ) -> heed::Result<()> { | ||||||
|     { |         let iter = | ||||||
|         let iter = FacetIter::new_non_reducing( |             FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; | ||||||
|             self.rtxn, self.index, field_id, candidates.clone(), |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         for result in iter { |         for result in iter { | ||||||
|             let (value, mut docids) = result?; |             let (value, mut docids) = result?; | ||||||
| @@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|     fn facet_values_from_raw_facet_database( |     fn facet_values_from_raw_facet_database( | ||||||
|         &self, |         &self, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|     ) -> heed::Result<BTreeMap<String, u64>> |     ) -> heed::Result<BTreeMap<String, u64>> { | ||||||
|     { |  | ||||||
|         let mut distribution = BTreeMap::new(); |         let mut distribution = BTreeMap::new(); | ||||||
|  |  | ||||||
|         let db = self.index.facet_id_f64_docids; |         let db = self.index.facet_id_f64_docids; | ||||||
| @@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let iter = self.index |         let iter = self | ||||||
|  |             .index | ||||||
|             .facet_id_string_docids |             .facet_id_string_docids | ||||||
|             .remap_key_type::<ByteSlice>() |             .remap_key_type::<ByteSlice>() | ||||||
|             .prefix_iter(self.rtxn, &[field_id])? |             .prefix_iter(self.rtxn, &[field_id])? | ||||||
| @@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             // to those candidates. We also enter here for facet strings for performance reasons. |             // to those candidates. We also enter here for facet strings for performance reasons. | ||||||
|             let mut distribution = BTreeMap::new(); |             let mut distribution = BTreeMap::new(); | ||||||
|             if candidates.len() <= CANDIDATES_THRESHOLD { |             if candidates.len() <= CANDIDATES_THRESHOLD { | ||||||
|                 self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; |                 self.facet_distribution_from_documents( | ||||||
|                 self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; |                     field_id, | ||||||
|  |                     Number, | ||||||
|  |                     candidates, | ||||||
|  |                     &mut distribution, | ||||||
|  |                 )?; | ||||||
|  |                 self.facet_distribution_from_documents( | ||||||
|  |                     field_id, | ||||||
|  |                     String, | ||||||
|  |                     candidates, | ||||||
|  |                     &mut distribution, | ||||||
|  |                 )?; | ||||||
|             } else { |             } else { | ||||||
|                 self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; |                 self.facet_numbers_distribution_from_facet_levels( | ||||||
|                 self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; |                     field_id, | ||||||
|  |                     candidates, | ||||||
|  |                     &mut distribution, | ||||||
|  |                 )?; | ||||||
|  |                 self.facet_distribution_from_documents( | ||||||
|  |                     field_id, | ||||||
|  |                     String, | ||||||
|  |                     candidates, | ||||||
|  |                     &mut distribution, | ||||||
|  |                 )?; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             Ok(distribution) |             Ok(distribution) | ||||||
| @@ -201,10 +217,11 @@ impl<'a> FacetDistribution<'a> { | |||||||
|  |  | ||||||
|         let mut distribution = BTreeMap::new(); |         let mut distribution = BTreeMap::new(); | ||||||
|         for name in filterable_fields { |         for name in filterable_fields { | ||||||
|             let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { |             let fid = | ||||||
|                 field_name: name.clone(), |                 fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { | ||||||
|                 process: "FacetDistribution::execute", |                     field_name: name.clone(), | ||||||
|             })?; |                     process: "FacetDistribution::execute", | ||||||
|  |                 })?; | ||||||
|             let values = self.facet_values(fid)?; |             let values = self.facet_values(fid)?; | ||||||
|             distribution.insert(name, values); |             distribution.insert(name, values); | ||||||
|         } |         } | ||||||
| @@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|  |  | ||||||
| impl fmt::Debug for FacetDistribution<'_> { | impl fmt::Debug for FacetDistribution<'_> { | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|         let FacetDistribution { |         let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; | ||||||
|             facets, |  | ||||||
|             candidates, |  | ||||||
|             max_values_by_facet, |  | ||||||
|             rtxn: _, |  | ||||||
|             index: _, |  | ||||||
|         } = self; |  | ||||||
|  |  | ||||||
|         f.debug_struct("FacetDistribution") |         f.debug_struct("FacetDistribution") | ||||||
|             .field("facets", facets) |             .field("facets", facets) | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| use std::collections::HashSet; | use std::collections::HashSet; | ||||||
| use std::fmt::Debug; | use std::fmt::Debug; | ||||||
| use std::ops::Bound::{self, Included, Excluded}; | use std::ops::Bound::{self, Excluded, Included}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
|  |  | ||||||
| @@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs}; | |||||||
| use pest::Parser; | use pest::Parser; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::error::UserError; |  | ||||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; |  | ||||||
| use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result}; |  | ||||||
|  |  | ||||||
| use super::FacetRange; |  | ||||||
| use super::parser::Rule; |  | ||||||
| use super::parser::{PREC_CLIMBER, FilterParser}; |  | ||||||
|  |  | ||||||
| use self::FilterCondition::*; | use self::FilterCondition::*; | ||||||
| use self::Operator::*; | use self::Operator::*; | ||||||
|  | use super::parser::{FilterParser, Rule, PREC_CLIMBER}; | ||||||
|  | use super::FacetRange; | ||||||
|  | use crate::error::UserError; | ||||||
|  | use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; | ||||||
|  | use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq)] | #[derive(Debug, Clone, PartialEq)] | ||||||
| pub enum Operator { | pub enum Operator { | ||||||
| @@ -39,13 +36,13 @@ impl Operator { | |||||||
|     /// an OR operation for the between case (i.e. `TO`). |     /// an OR operation for the between case (i.e. `TO`). | ||||||
|     fn negate(self) -> (Self, Option<Self>) { |     fn negate(self) -> (Self, Option<Self>) { | ||||||
|         match self { |         match self { | ||||||
|             GreaterThan(n)        => (LowerThanOrEqual(n), None), |             GreaterThan(n) => (LowerThanOrEqual(n), None), | ||||||
|             GreaterThanOrEqual(n) => (LowerThan(n), None), |             GreaterThanOrEqual(n) => (LowerThan(n), None), | ||||||
|             Equal(n, s)           => (NotEqual(n, s), None), |             Equal(n, s) => (NotEqual(n, s), None), | ||||||
|             NotEqual(n, s)        => (Equal(n, s), None), |             NotEqual(n, s) => (Equal(n, s), None), | ||||||
|             LowerThan(n)          => (GreaterThanOrEqual(n), None), |             LowerThan(n) => (GreaterThanOrEqual(n), None), | ||||||
|             LowerThanOrEqual(n)   => (GreaterThan(n), None), |             LowerThanOrEqual(n) => (GreaterThan(n), None), | ||||||
|             Between(n, m)         => (LowerThan(n), Some(GreaterThan(m))), |             Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -63,10 +60,11 @@ impl FilterCondition { | |||||||
|         index: &Index, |         index: &Index, | ||||||
|         array: I, |         array: I, | ||||||
|     ) -> Result<Option<FilterCondition>> |     ) -> Result<Option<FilterCondition>> | ||||||
|     where I: IntoIterator<Item=Either<J, B>>, |     where | ||||||
|           J: IntoIterator<Item=A>, |         I: IntoIterator<Item = Either<J, B>>, | ||||||
|           A: AsRef<str>, |         J: IntoIterator<Item = A>, | ||||||
|           B: AsRef<str>, |         A: AsRef<str>, | ||||||
|  |         B: AsRef<str>, | ||||||
|     { |     { | ||||||
|         let mut ands = None; |         let mut ands = None; | ||||||
|  |  | ||||||
| @@ -88,7 +86,7 @@ impl FilterCondition { | |||||||
|                             None => Some(rule), |                             None => Some(rule), | ||||||
|                         }; |                         }; | ||||||
|                     } |                     } | ||||||
|                 }, |                 } | ||||||
|                 Either::Right(rule) => { |                 Either::Right(rule) => { | ||||||
|                     let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; |                     let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; | ||||||
|                     ands = match ands.take() { |                     ands = match ands.take() { | ||||||
| @@ -106,11 +104,11 @@ impl FilterCondition { | |||||||
|         rtxn: &heed::RoTxn, |         rtxn: &heed::RoTxn, | ||||||
|         index: &Index, |         index: &Index, | ||||||
|         expression: &str, |         expression: &str, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; |         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|         let filterable_fields = index.filterable_fields_ids(rtxn)?; |         let filterable_fields = index.filterable_fields_ids(rtxn)?; | ||||||
|         let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; |         let lexed = | ||||||
|  |             FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; | ||||||
|         FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) |         FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -118,8 +116,7 @@ impl FilterCondition { | |||||||
|         fim: &FieldsIdsMap, |         fim: &FieldsIdsMap, | ||||||
|         ff: &HashSet<FieldId>, |         ff: &HashSet<FieldId>, | ||||||
|         expression: Pairs<Rule>, |         expression: Pairs<Rule>, | ||||||
|     ) -> Result<Self> |     ) -> Result<Self> { | ||||||
|     { |  | ||||||
|         PREC_CLIMBER.climb( |         PREC_CLIMBER.climb( | ||||||
|             expression, |             expression, | ||||||
|             |pair: Pair<Rule>| match pair.as_rule() { |             |pair: Pair<Rule>| match pair.as_rule() { | ||||||
| @@ -135,12 +132,10 @@ impl FilterCondition { | |||||||
|                 Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), |                 Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), | ||||||
|                 _ => unreachable!(), |                 _ => unreachable!(), | ||||||
|             }, |             }, | ||||||
|             |lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| { |             |lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| match op.as_rule() { | ||||||
|                 match op.as_rule() { |                 Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), | ||||||
|                     Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), |                 Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), | ||||||
|                     Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), |                 _ => unreachable!(), | ||||||
|                     _ => unreachable!(), |  | ||||||
|                 } |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     } |     } | ||||||
| @@ -160,8 +155,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -179,8 +173,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -196,8 +189,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -213,8 +205,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -230,8 +221,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -247,8 +237,7 @@ impl FilterCondition { | |||||||
|         fields_ids_map: &FieldsIdsMap, |         fields_ids_map: &FieldsIdsMap, | ||||||
|         filterable_fields: &HashSet<FieldId>, |         filterable_fields: &HashSet<FieldId>, | ||||||
|         item: Pair<Rule>, |         item: Pair<Rule>, | ||||||
|     ) -> Result<FilterCondition> |     ) -> Result<FilterCondition> { | ||||||
|     { |  | ||||||
|         let mut items = item.into_inner(); |         let mut items = item.into_inner(); | ||||||
|         let fid = field_id(fields_ids_map, filterable_fields, &mut items) |         let fid = field_id(fields_ids_map, filterable_fields, &mut items) | ||||||
|             .map_err(UserError::InvalidFilterAttribute)?; |             .map_err(UserError::InvalidFilterAttribute)?; | ||||||
| @@ -272,13 +261,14 @@ impl FilterCondition { | |||||||
|         left: Bound<f64>, |         left: Bound<f64>, | ||||||
|         right: Bound<f64>, |         right: Bound<f64>, | ||||||
|         output: &mut RoaringBitmap, |         output: &mut RoaringBitmap, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         match (left, right) { |         match (left, right) { | ||||||
|             // If the request is an exact value we must go directly to the deepest level. |             // If the request is an exact value we must go directly to the deepest level. | ||||||
|             (Included(l), Included(r)) if l == r && level > 0 => { |             (Included(l), Included(r)) if l == r && level > 0 => { | ||||||
|                 return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output); |                 return Self::explore_facet_number_levels( | ||||||
|             }, |                     rtxn, db, field_id, 0, left, right, output, | ||||||
|  |                 ); | ||||||
|  |             } | ||||||
|             // lower TO upper when lower > upper must return no result |             // lower TO upper when lower > upper must return no result | ||||||
|             (Included(l), Included(r)) if l > r => return Ok(()), |             (Included(l), Included(r)) if l > r => return Ok(()), | ||||||
|             (Included(l), Excluded(r)) if l >= r => return Ok(()), |             (Included(l), Excluded(r)) if l >= r => return Ok(()), | ||||||
| @@ -301,7 +291,9 @@ impl FilterCondition { | |||||||
|             debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); |             debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); | ||||||
|             output.union_with(&docids); |             output.union_with(&docids); | ||||||
|             // We save the leftest and rightest bounds we actually found at this level. |             // We save the leftest and rightest bounds we actually found at this level. | ||||||
|             if i == 0 { left_found = Some(l); } |             if i == 0 { | ||||||
|  |                 left_found = Some(l); | ||||||
|  |             } | ||||||
|             right_found = Some(r); |             right_found = Some(r); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -318,20 +310,50 @@ impl FilterCondition { | |||||||
|                 // If the bound is satisfied we avoid calling this function again. |                 // If the bound is satisfied we avoid calling this function again. | ||||||
|                 if !matches!(left, Included(l) if l == left_found) { |                 if !matches!(left, Included(l) if l == left_found) { | ||||||
|                     let sub_right = Excluded(left_found); |                     let sub_right = Excluded(left_found); | ||||||
|                     debug!("calling left with {:?} to {:?} (level {})",  left, sub_right, deeper_level); |                     debug!( | ||||||
|                     Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?; |                         "calling left with {:?} to {:?} (level {})", | ||||||
|  |                         left, sub_right, deeper_level | ||||||
|  |                     ); | ||||||
|  |                     Self::explore_facet_number_levels( | ||||||
|  |                         rtxn, | ||||||
|  |                         db, | ||||||
|  |                         field_id, | ||||||
|  |                         deeper_level, | ||||||
|  |                         left, | ||||||
|  |                         sub_right, | ||||||
|  |                         output, | ||||||
|  |                     )?; | ||||||
|                 } |                 } | ||||||
|                 if !matches!(right, Included(r) if r == right_found) { |                 if !matches!(right, Included(r) if r == right_found) { | ||||||
|                     let sub_left = Excluded(right_found); |                     let sub_left = Excluded(right_found); | ||||||
|                     debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); |                     debug!( | ||||||
|                     Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?; |                         "calling right with {:?} to {:?} (level {})", | ||||||
|  |                         sub_left, right, deeper_level | ||||||
|  |                     ); | ||||||
|  |                     Self::explore_facet_number_levels( | ||||||
|  |                         rtxn, | ||||||
|  |                         db, | ||||||
|  |                         field_id, | ||||||
|  |                         deeper_level, | ||||||
|  |                         sub_left, | ||||||
|  |                         right, | ||||||
|  |                         output, | ||||||
|  |                     )?; | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             None => { |             None => { | ||||||
|                 // If we found nothing at this level it means that we must find |                 // If we found nothing at this level it means that we must find | ||||||
|                 // the same bounds but at a deeper, more precise level. |                 // the same bounds but at a deeper, more precise level. | ||||||
|                 Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?; |                 Self::explore_facet_number_levels( | ||||||
|             }, |                     rtxn, | ||||||
|  |                     db, | ||||||
|  |                     field_id, | ||||||
|  |                     deeper_level, | ||||||
|  |                     left, | ||||||
|  |                     right, | ||||||
|  |                     output, | ||||||
|  |                 )?; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -344,27 +366,34 @@ impl FilterCondition { | |||||||
|         strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>, |         strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         operator: &Operator, |         operator: &Operator, | ||||||
|     ) -> Result<RoaringBitmap> |     ) -> Result<RoaringBitmap> { | ||||||
|     { |  | ||||||
|         // Make sure we always bound the ranges with the field id and the level, |         // Make sure we always bound the ranges with the field id and the level, | ||||||
|         // as the facets values are all in the same database and prefixed by the |         // as the facets values are all in the same database and prefixed by the | ||||||
|         // field id and the level. |         // field id and the level. | ||||||
|         let (left, right) = match operator { |         let (left, right) = match operator { | ||||||
|             GreaterThan(val)        => (Excluded(*val), Included(f64::MAX)), |             GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), | ||||||
|             GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), |             GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), | ||||||
|             Equal(number, string)   => { |             Equal(number, string) => { | ||||||
|                 let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); |                 let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); | ||||||
|                 let number_docids = match number { |                 let number_docids = match number { | ||||||
|                     Some(n) => { |                     Some(n) => { | ||||||
|                         let n = Included(*n); |                         let n = Included(*n); | ||||||
|                         let mut output = RoaringBitmap::new(); |                         let mut output = RoaringBitmap::new(); | ||||||
|                         Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; |                         Self::explore_facet_number_levels( | ||||||
|  |                             rtxn, | ||||||
|  |                             numbers_db, | ||||||
|  |                             field_id, | ||||||
|  |                             0, | ||||||
|  |                             n, | ||||||
|  |                             n, | ||||||
|  |                             &mut output, | ||||||
|  |                         )?; | ||||||
|                         output |                         output | ||||||
|                     }, |                     } | ||||||
|                     None => RoaringBitmap::new(), |                     None => RoaringBitmap::new(), | ||||||
|                 }; |                 }; | ||||||
|                 return Ok(string_docids | number_docids); |                 return Ok(string_docids | number_docids); | ||||||
|             }, |             } | ||||||
|             NotEqual(number, string) => { |             NotEqual(number, string) => { | ||||||
|                 let all_numbers_ids = if number.is_some() { |                 let all_numbers_ids = if number.is_some() { | ||||||
|                     index.number_faceted_documents_ids(rtxn, field_id)? |                     index.number_faceted_documents_ids(rtxn, field_id)? | ||||||
| @@ -373,12 +402,14 @@ impl FilterCondition { | |||||||
|                 }; |                 }; | ||||||
|                 let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; |                 let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; | ||||||
|                 let operator = Equal(*number, string.clone()); |                 let operator = Equal(*number, string.clone()); | ||||||
|                 let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; |                 let docids = Self::evaluate_operator( | ||||||
|  |                     rtxn, index, numbers_db, strings_db, field_id, &operator, | ||||||
|  |                 )?; | ||||||
|                 return Ok((all_numbers_ids | all_strings_ids) - docids); |                 return Ok((all_numbers_ids | all_strings_ids) - docids); | ||||||
|             }, |             } | ||||||
|             LowerThan(val)        => (Included(f64::MIN), Excluded(*val)), |             LowerThan(val) => (Included(f64::MIN), Excluded(*val)), | ||||||
|             LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), |             LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), | ||||||
|             Between(left, right)  => (Included(*left),    Included(*right)), |             Between(left, right) => (Included(*left), Included(*right)), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // Ask for the biggest value that can exist for this specific field, if it exists |         // Ask for the biggest value that can exist for this specific field, if it exists | ||||||
| @@ -391,36 +422,39 @@ impl FilterCondition { | |||||||
|         match biggest_level { |         match biggest_level { | ||||||
|             Some(level) => { |             Some(level) => { | ||||||
|                 let mut output = RoaringBitmap::new(); |                 let mut output = RoaringBitmap::new(); | ||||||
|                 Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; |                 Self::explore_facet_number_levels( | ||||||
|  |                     rtxn, | ||||||
|  |                     numbers_db, | ||||||
|  |                     field_id, | ||||||
|  |                     level, | ||||||
|  |                     left, | ||||||
|  |                     right, | ||||||
|  |                     &mut output, | ||||||
|  |                 )?; | ||||||
|                 Ok(output) |                 Ok(output) | ||||||
|             }, |             } | ||||||
|             None => Ok(RoaringBitmap::new()), |             None => Ok(RoaringBitmap::new()), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn evaluate( |     pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { | ||||||
|         &self, |  | ||||||
|         rtxn: &heed::RoTxn, |  | ||||||
|         index: &Index, |  | ||||||
|     ) -> Result<RoaringBitmap> |  | ||||||
|     { |  | ||||||
|         let numbers_db = index.facet_id_f64_docids; |         let numbers_db = index.facet_id_f64_docids; | ||||||
|         let strings_db = index.facet_id_string_docids; |         let strings_db = index.facet_id_string_docids; | ||||||
|  |  | ||||||
|         match self { |         match self { | ||||||
|             Operator(fid, op) => { |             Operator(fid, op) => { | ||||||
|                 Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) |                 Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) | ||||||
|             }, |             } | ||||||
|             Or(lhs, rhs) => { |             Or(lhs, rhs) => { | ||||||
|                 let lhs = lhs.evaluate(rtxn, index)?; |                 let lhs = lhs.evaluate(rtxn, index)?; | ||||||
|                 let rhs = rhs.evaluate(rtxn, index)?; |                 let rhs = rhs.evaluate(rtxn, index)?; | ||||||
|                 Ok(lhs | rhs) |                 Ok(lhs | rhs) | ||||||
|             }, |             } | ||||||
|             And(lhs, rhs) => { |             And(lhs, rhs) => { | ||||||
|                 let lhs = lhs.evaluate(rtxn, index)?; |                 let lhs = lhs.evaluate(rtxn, index)?; | ||||||
|                 let rhs = rhs.evaluate(rtxn, index)?; |                 let rhs = rhs.evaluate(rtxn, index)?; | ||||||
|                 Ok(lhs & rhs) |                 Ok(lhs & rhs) | ||||||
|             }, |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -434,23 +468,24 @@ fn field_id( | |||||||
|     fields_ids_map: &FieldsIdsMap, |     fields_ids_map: &FieldsIdsMap, | ||||||
|     filterable_fields: &HashSet<FieldId>, |     filterable_fields: &HashSet<FieldId>, | ||||||
|     items: &mut Pairs<Rule>, |     items: &mut Pairs<Rule>, | ||||||
| ) -> StdResult<FieldId, PestError<Rule>> | ) -> StdResult<FieldId, PestError<Rule>> { | ||||||
| { |  | ||||||
|     // lexing ensures that we at least have a key |     // lexing ensures that we at least have a key | ||||||
|     let key = items.next().unwrap(); |     let key = items.next().unwrap(); | ||||||
|  |  | ||||||
|     let field_id = match fields_ids_map.id(key.as_str()) { |     let field_id = match fields_ids_map.id(key.as_str()) { | ||||||
|         Some(field_id) => field_id, |         Some(field_id) => field_id, | ||||||
|         None => return Err(PestError::new_from_span( |         None => { | ||||||
|             ErrorVariant::CustomError { |             return Err(PestError::new_from_span( | ||||||
|                 message: format!( |                 ErrorVariant::CustomError { | ||||||
|                     "attribute `{}` not found, available attributes are: {}", |                     message: format!( | ||||||
|                     key.as_str(), |                         "attribute `{}` not found, available attributes are: {}", | ||||||
|                     fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "), |                         key.as_str(), | ||||||
|                 ), |                         fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "), | ||||||
|             }, |                     ), | ||||||
|             key.as_span(), |                 }, | ||||||
|         )), |                 key.as_span(), | ||||||
|  |             )) | ||||||
|  |         } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     if !filterable_fields.contains(&field_id) { |     if !filterable_fields.contains(&field_id) { | ||||||
| @@ -459,9 +494,11 @@ fn field_id( | |||||||
|                 message: format!( |                 message: format!( | ||||||
|                     "attribute `{}` is not filterable, available filterable attributes are: {}", |                     "attribute `{}` is not filterable, available filterable attributes are: {}", | ||||||
|                     key.as_str(), |                     key.as_str(), | ||||||
|                     filterable_fields.iter().flat_map(|id| { |                     filterable_fields | ||||||
|                         fields_ids_map.name(*id) |                         .iter() | ||||||
|                     }).collect::<Vec<_>>().join(", "), |                         .flat_map(|id| { fields_ids_map.name(*id) }) | ||||||
|  |                         .collect::<Vec<_>>() | ||||||
|  |                         .join(", "), | ||||||
|                 ), |                 ), | ||||||
|             }, |             }, | ||||||
|             key.as_span(), |             key.as_span(), | ||||||
| @@ -476,8 +513,9 @@ fn field_id( | |||||||
| /// | /// | ||||||
| /// Returns the parsing error associated with the span if the conversion fails. | /// Returns the parsing error associated with the span if the conversion fails. | ||||||
| fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String) | fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String) | ||||||
| where T: FromStr, | where | ||||||
|       T::Err: ToString, |     T: FromStr, | ||||||
|  |     T::Err: ToString, | ||||||
| { | { | ||||||
|     let result = match pair.as_str().parse::<T>() { |     let result = match pair.as_str().parse::<T>() { | ||||||
|         Ok(value) => Ok(value), |         Ok(value) => Ok(value), | ||||||
| @@ -492,11 +530,12 @@ where T: FromStr, | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |     use big_s::S; | ||||||
|     use crate::update::Settings; |  | ||||||
|     use heed::EnvOpenOptions; |     use heed::EnvOpenOptions; | ||||||
|     use maplit::hashset; |     use maplit::hashset; | ||||||
|     use big_s::S; |  | ||||||
|  |     use super::*; | ||||||
|  |     use crate::update::Settings; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn string() { |     fn string() { | ||||||
| @@ -508,7 +547,7 @@ mod tests { | |||||||
|         // Set the filterable fields to be the channel. |         // Set the filterable fields to be the channel. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_filterable_fields(hashset!{ S("channel") }); |         builder.set_filterable_fields(hashset! { S("channel") }); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -537,7 +576,7 @@ mod tests { | |||||||
|         // Set the filterable fields to be the channel. |         // Set the filterable fields to be the channel. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_filterable_fields(hashset!{ "timestamp".into() }); |         builder.set_filterable_fields(hashset! { "timestamp".into() }); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -548,10 +587,8 @@ mod tests { | |||||||
|         assert_eq!(condition, expected); |         assert_eq!(condition, expected); | ||||||
|  |  | ||||||
|         let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); |         let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); | ||||||
|         let expected = Or( |         let expected = | ||||||
|             Box::new(Operator(0, LowerThan(22.0))), |             Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0)))); | ||||||
|             Box::new(Operator(0, GreaterThan(44.0))), |  | ||||||
|         ); |  | ||||||
|         assert_eq!(condition, expected); |         assert_eq!(condition, expected); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -566,29 +603,33 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order |         builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order | ||||||
|         builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); |         builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         // Test that the facet condition is correctly generated. |         // Test that the facet condition is correctly generated. | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         let condition = FilterCondition::from_str( |         let condition = FilterCondition::from_str( | ||||||
|             &rtxn, &index, |             &rtxn, | ||||||
|  |             &index, | ||||||
|             "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", |             "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", | ||||||
|         ).unwrap(); |         ) | ||||||
|  |         .unwrap(); | ||||||
|         let expected = Or( |         let expected = Or( | ||||||
|             Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), |             Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), | ||||||
|             Box::new(And( |             Box::new(And( | ||||||
|                 Box::new(Operator(1, Between(22.0, 44.0))), |                 Box::new(Operator(1, Between(22.0, 44.0))), | ||||||
|                 Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), |                 Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), | ||||||
|             )) |             )), | ||||||
|         ); |         ); | ||||||
|         assert_eq!(condition, expected); |         assert_eq!(condition, expected); | ||||||
|  |  | ||||||
|         let condition = FilterCondition::from_str( |         let condition = FilterCondition::from_str( | ||||||
|             &rtxn, &index, |             &rtxn, | ||||||
|  |             &index, | ||||||
|             "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", |             "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", | ||||||
|         ).unwrap(); |         ) | ||||||
|  |         .unwrap(); | ||||||
|         let expected = Or( |         let expected = Or( | ||||||
|             Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), |             Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), | ||||||
|             Box::new(Or( |             Box::new(Or( | ||||||
| @@ -613,20 +654,28 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order |         builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order | ||||||
|         builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); |         builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         // Test that the facet condition is correctly generated. |         // Test that the facet condition is correctly generated. | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         let condition = FilterCondition::from_array( |         let condition = FilterCondition::from_array( | ||||||
|             &rtxn, &index, |             &rtxn, | ||||||
|             vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], |             &index, | ||||||
|         ).unwrap().unwrap(); |             vec![ | ||||||
|  |                 Either::Right("channel = gotaga"), | ||||||
|  |                 Either::Left(vec!["timestamp = 44", "channel != ponce"]), | ||||||
|  |             ], | ||||||
|  |         ) | ||||||
|  |         .unwrap() | ||||||
|  |         .unwrap(); | ||||||
|         let expected = FilterCondition::from_str( |         let expected = FilterCondition::from_str( | ||||||
|             &rtxn, &index, |             &rtxn, | ||||||
|  |             &index, | ||||||
|             "channel = gotaga AND (timestamp = 44 OR channel != ponce)", |             "channel = gotaga AND (timestamp = 44 OR channel != ponce)", | ||||||
|         ).unwrap(); |         ) | ||||||
|  |         .unwrap(); | ||||||
|         assert_eq!(condition, expected); |         assert_eq!(condition, expected); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,20 +1,19 @@ | |||||||
| use std::ops::Bound::{self, Included, Excluded, Unbounded}; | use std::ops::Bound::{self, Excluded, Included, Unbounded}; | ||||||
|  |  | ||||||
| use either::Either::{self, Left, Right}; | use either::Either::{self, Left, Right}; | ||||||
| use heed::types::{DecodeIgnore, ByteSlice}; | use heed::types::{ByteSlice, DecodeIgnore}; | ||||||
| use heed::{Database, RoRange, RoRevRange, LazyDecode}; | use heed::{Database, LazyDecode, RoRange, RoRevRange}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::heed_codec::CboRoaringBitmapCodec; |  | ||||||
| use crate::heed_codec::facet::FacetLevelValueF64Codec; |  | ||||||
| use crate::{Index, FieldId}; |  | ||||||
|  |  | ||||||
| pub use self::facet_distribution::FacetDistribution; | pub use self::facet_distribution::FacetDistribution; | ||||||
| pub use self::filter_condition::{FilterCondition, Operator}; | pub use self::filter_condition::{FilterCondition, Operator}; | ||||||
| pub(crate) use self::parser::Rule as ParserRule; | pub(crate) use self::parser::Rule as ParserRule; | ||||||
|  | use crate::heed_codec::facet::FacetLevelValueF64Codec; | ||||||
|  | use crate::heed_codec::CboRoaringBitmapCodec; | ||||||
|  | use crate::{FieldId, Index}; | ||||||
|  |  | ||||||
| mod filter_condition; |  | ||||||
| mod facet_distribution; | mod facet_distribution; | ||||||
|  | mod filter_condition; | ||||||
| mod parser; | mod parser; | ||||||
|  |  | ||||||
| pub struct FacetRange<'t> { | pub struct FacetRange<'t> { | ||||||
| @@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> { | |||||||
|         level: u8, |         level: u8, | ||||||
|         left: Bound<f64>, |         left: Bound<f64>, | ||||||
|         right: Bound<f64>, |         right: Bound<f64>, | ||||||
|     ) -> heed::Result<FacetRange<'t>> |     ) -> heed::Result<FacetRange<'t>> { | ||||||
|     { |  | ||||||
|         let left_bound = match left { |         let left_bound = match left { | ||||||
|             Included(left) => Included((field_id, level, left, f64::MIN)), |             Included(left) => Included((field_id, level, left, f64::MIN)), | ||||||
|             Excluded(left) => Excluded((field_id, level, left, f64::MIN)), |             Excluded(left) => Excluded((field_id, level, left, f64::MIN)), | ||||||
| @@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> { | |||||||
|                 } else { |                 } else { | ||||||
|                     None |                     None | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             Some(Err(e)) => Some(Err(e)), |             Some(Err(e)) => Some(Err(e)), | ||||||
|             None => None, |             None => None, | ||||||
|         } |         } | ||||||
| @@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> { | |||||||
|         level: u8, |         level: u8, | ||||||
|         left: Bound<f64>, |         left: Bound<f64>, | ||||||
|         right: Bound<f64>, |         right: Bound<f64>, | ||||||
|     ) -> heed::Result<FacetRevRange<'t>> |     ) -> heed::Result<FacetRevRange<'t>> { | ||||||
|     { |  | ||||||
|         let left_bound = match left { |         let left_bound = match left { | ||||||
|             Included(left) => Included((field_id, level, left, f64::MIN)), |             Included(left) => Included((field_id, level, left, f64::MIN)), | ||||||
|             Excluded(left) => Excluded((field_id, level, left, f64::MIN)), |             Excluded(left) => Excluded((field_id, level, left, f64::MIN)), | ||||||
| @@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> { | |||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                     continue; |                     continue; | ||||||
|                 }, |                 } | ||||||
|                 Some(Err(e)) => return Some(Err(e)), |                 Some(Err(e)) => return Some(Err(e)), | ||||||
|                 None => return None, |                 None => return None, | ||||||
|             } |             } | ||||||
| @@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> { | |||||||
|         index: &'t Index, |         index: &'t Index, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         documents_ids: RoaringBitmap, |         documents_ids: RoaringBitmap, | ||||||
|     ) -> heed::Result<FacetIter<'t>> |     ) -> heed::Result<FacetIter<'t>> { | ||||||
|     { |  | ||||||
|         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); |         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); | ||||||
|         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); |         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); | ||||||
|         let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; |         let highest_iter = | ||||||
|  |             FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; | ||||||
|         let level_iters = vec![(documents_ids, Left(highest_iter))]; |         let level_iters = vec![(documents_ids, Left(highest_iter))]; | ||||||
|         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) |         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) | ||||||
|     } |     } | ||||||
| @@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> { | |||||||
|         index: &'t Index, |         index: &'t Index, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         documents_ids: RoaringBitmap, |         documents_ids: RoaringBitmap, | ||||||
|     ) -> heed::Result<FacetIter<'t>> |     ) -> heed::Result<FacetIter<'t>> { | ||||||
|     { |  | ||||||
|         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); |         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); | ||||||
|         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); |         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); | ||||||
|         let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; |         let highest_iter = | ||||||
|  |             FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; | ||||||
|         let level_iters = vec![(documents_ids, Right(highest_iter))]; |         let level_iters = vec![(documents_ids, Right(highest_iter))]; | ||||||
|         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) |         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) | ||||||
|     } |     } | ||||||
| @@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> { | |||||||
|         index: &'t Index, |         index: &'t Index, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         documents_ids: RoaringBitmap, |         documents_ids: RoaringBitmap, | ||||||
|     ) -> heed::Result<FacetIter<'t>> |     ) -> heed::Result<FacetIter<'t>> { | ||||||
|     { |  | ||||||
|         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); |         let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); | ||||||
|         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); |         let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); | ||||||
|         let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; |         let highest_iter = | ||||||
|  |             FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; | ||||||
|         let level_iters = vec![(documents_ids, Left(highest_iter))]; |         let level_iters = vec![(documents_ids, Left(highest_iter))]; | ||||||
|         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) |         Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) | ||||||
|     } |     } | ||||||
| @@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> { | |||||||
|         rtxn: &'t heed::RoTxn, |         rtxn: &'t heed::RoTxn, | ||||||
|         db: Database<FacetLevelValueF64Codec, X>, |         db: Database<FacetLevelValueF64Codec, X>, | ||||||
|         fid: FieldId, |         fid: FieldId, | ||||||
|     ) -> heed::Result<Option<u8>> |     ) -> heed::Result<Option<u8>> { | ||||||
|     { |         let level = db | ||||||
|         let level = db.remap_types::<ByteSlice, DecodeIgnore>() |             .remap_types::<ByteSlice, DecodeIgnore>() | ||||||
|             .prefix_iter(rtxn, &[fid][..])? |             .prefix_iter(rtxn, &[fid][..])? | ||||||
|             .remap_key_type::<FacetLevelValueF64Codec>() |             .remap_key_type::<FacetLevelValueF64Codec>() | ||||||
|             .last().transpose()? |             .last() | ||||||
|  |             .transpose()? | ||||||
|             .map(|((_, level, _, _), _)| level); |             .map(|((_, level, _, _), _)| level); | ||||||
|         Ok(level) |         Ok(level) | ||||||
|     } |     } | ||||||
| @@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> { | |||||||
|  |  | ||||||
|                 match result { |                 match result { | ||||||
|                     Ok(((_fid, level, left, right), mut docids)) => { |                     Ok(((_fid, level, left, right), mut docids)) => { | ||||||
|  |  | ||||||
|                         docids.intersect_with(&documents_ids); |                         docids.intersect_with(&documents_ids); | ||||||
|                         if !docids.is_empty() { |                         if !docids.is_empty() { | ||||||
|                             if self.must_reduce { |                             if self.must_reduce { | ||||||
| @@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> { | |||||||
|                                 Ok(iter) => { |                                 Ok(iter) => { | ||||||
|                                     self.level_iters.push((docids, iter)); |                                     self.level_iters.push((docids, iter)); | ||||||
|                                     continue 'outer; |                                     continue 'outer; | ||||||
|                                 }, |                                 } | ||||||
|                                 Err(e) => return Some(Err(e)), |                                 Err(e) => return Some(Err(e)), | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|                     }, |                     } | ||||||
|                     Err(e) => return Some(Err(e)), |                     Err(e) => return Some(Err(e)), | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use pest::prec_climber::{Operator, Assoc, PrecClimber}; | use pest::prec_climber::{Assoc, Operator, PrecClimber}; | ||||||
|  |  | ||||||
| pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| { | pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| { | ||||||
|     use Assoc::*; |     use Assoc::*; | ||||||
|   | |||||||
| @@ -1,13 +1,11 @@ | |||||||
| use std::collections::HashSet; |  | ||||||
| use std::cmp::{min, Reverse}; | use std::cmp::{min, Reverse}; | ||||||
| use std::collections::BTreeMap; | use std::collections::{BTreeMap, HashSet}; | ||||||
| use std::ops::{Index, IndexMut}; | use std::ops::{Index, IndexMut}; | ||||||
|  |  | ||||||
| use levenshtein_automata::{DFA, Distance}; | use levenshtein_automata::{Distance, DFA}; | ||||||
|  |  | ||||||
| use crate::search::query_tree::{Operation, Query}; |  | ||||||
|  |  | ||||||
| use super::build_dfa; | use super::build_dfa; | ||||||
|  | use crate::search::query_tree::{Operation, Query}; | ||||||
|  |  | ||||||
| type IsPrefix = bool; | type IsPrefix = bool; | ||||||
|  |  | ||||||
| @@ -28,7 +26,9 @@ impl MatchingWords { | |||||||
|             .collect(); |             .collect(); | ||||||
|         // Sort word by len in DESC order prioritizing the longuest word, |         // Sort word by len in DESC order prioritizing the longuest word, | ||||||
|         // in order to highlight the longuest part of the matched word. |         // in order to highlight the longuest part of the matched word. | ||||||
|         dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); |         dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { | ||||||
|  |             Reverse(query_word.len()) | ||||||
|  |         }); | ||||||
|         Self { dfas } |         Self { dfas } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -37,12 +37,13 @@ impl MatchingWords { | |||||||
|         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { |         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { | ||||||
|             Distance::Exact(t) if t <= *typo => { |             Distance::Exact(t) if t <= *typo => { | ||||||
|                 if *is_prefix { |                 if *is_prefix { | ||||||
|                     let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); |                     let (_dist, len) = | ||||||
|  |                         prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); | ||||||
|                     Some(len) |                     Some(len) | ||||||
|                 } else { |                 } else { | ||||||
|                     Some(word.len()) |                     Some(word.len()) | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             _otherwise => None, |             _otherwise => None, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
| @@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { | |||||||
|         match tree { |         match tree { | ||||||
|             Operation::Or(_, ops) | Operation::And(ops) => { |             Operation::Or(_, ops) | Operation::And(ops) => { | ||||||
|                 ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); |                 ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); | ||||||
|             }, |             } | ||||||
|             Operation::Query(Query { prefix, kind }) => { |             Operation::Query(Query { prefix, kind }) => { | ||||||
|                 let typo = if kind.is_exact() { 0 } else { kind.typo() }; |                 let typo = if kind.is_exact() { 0 } else { kind.typo() }; | ||||||
|                 out.insert((kind.word(), typo, *prefix)); |                 out.insert((kind.word(), typo, *prefix)); | ||||||
|             }, |             } | ||||||
|             Operation::Phrase(words) => { |             Operation::Phrase(words) => { | ||||||
|                 for word in words { |                 for word in words { | ||||||
|                     out.insert((word, 0, false)); |                     out.insert((word, 0, false)); | ||||||
| @@ -80,10 +81,7 @@ struct N2Array<T> { | |||||||
|  |  | ||||||
| impl<T: Clone> N2Array<T> { | impl<T: Clone> N2Array<T> { | ||||||
|     fn new(x: usize, y: usize, value: T) -> N2Array<T> { |     fn new(x: usize, y: usize, value: T) -> N2Array<T> { | ||||||
|         N2Array { |         N2Array { y_size: y, buf: vec![value; x * y] } | ||||||
|             y_size: y, |  | ||||||
|             buf: vec![value; x * y], |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { | |||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     use crate::MatchingWords; |  | ||||||
|     use crate::search::query_tree::{Operation, Query, QueryKind}; |     use crate::search::query_tree::{Operation, Query, QueryKind}; | ||||||
|  |     use crate::MatchingWords; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn matched_length() { |     fn matched_length() { | ||||||
| @@ -194,13 +191,23 @@ mod tests { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn matching_words() { |     fn matching_words() { | ||||||
|         let query_tree = Operation::Or(false, vec![ |         let query_tree = Operation::Or( | ||||||
|             Operation::And(vec![ |             false, | ||||||
|                 Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }), |             vec![Operation::And(vec![ | ||||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), |                 Operation::Query(Query { | ||||||
|                 Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }), |                     prefix: true, | ||||||
|             ]), |                     kind: QueryKind::exact("split".to_string()), | ||||||
|         ]); |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: false, | ||||||
|  |                     kind: QueryKind::exact("this".to_string()), | ||||||
|  |                 }), | ||||||
|  |                 Operation::Query(Query { | ||||||
|  |                     prefix: true, | ||||||
|  |                     kind: QueryKind::tolerant(1, "world".to_string()), | ||||||
|  |                 }), | ||||||
|  |             ])], | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         let matching_words = MatchingWords::from_query_tree(&query_tree); |         let matching_words = MatchingWords::from_query_tree(&query_tree); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use std::result::Result as StdResult; | |||||||
| use std::str::Utf8Error; | use std::str::Utf8Error; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  |  | ||||||
|  | use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | ||||||
| use log::debug; | use log::debug; | ||||||
| @@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; | |||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use roaring::bitmap::RoaringBitmap; | use roaring::bitmap::RoaringBitmap; | ||||||
|  |  | ||||||
|  | pub(crate) use self::facet::ParserRule; | ||||||
|  | pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator}; | ||||||
|  | pub use self::matching_words::MatchingWords; | ||||||
|  | use self::query_tree::QueryTreeBuilder; | ||||||
| use crate::error::FieldIdMapMissingEntry; | use crate::error::FieldIdMapMissingEntry; | ||||||
| use crate::search::criteria::r#final::{Final, FinalResult}; | use crate::search::criteria::r#final::{Final, FinalResult}; | ||||||
| use crate::{Index, DocumentId, Result}; | use crate::{DocumentId, Index, Result}; | ||||||
|  |  | ||||||
| pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; |  | ||||||
| pub use self::matching_words::MatchingWords; |  | ||||||
| pub(crate) use self::facet::ParserRule; |  | ||||||
| use self::query_tree::QueryTreeBuilder; |  | ||||||
|  |  | ||||||
| use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; |  | ||||||
|  |  | ||||||
| // Building these factories is not free. | // Building these factories is not free. | ||||||
| static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | ||||||
| @@ -32,8 +30,8 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | |||||||
| mod criteria; | mod criteria; | ||||||
| mod distinct; | mod distinct; | ||||||
| mod facet; | mod facet; | ||||||
| mod query_tree; |  | ||||||
| mod matching_words; | mod matching_words; | ||||||
|  | mod query_tree; | ||||||
|  |  | ||||||
| pub struct Search<'a> { | pub struct Search<'a> { | ||||||
|     query: Option<String>, |     query: Option<String>, | ||||||
| @@ -117,7 +115,7 @@ impl<'a> Search<'a> { | |||||||
|                 let result = analyzer.analyze(query); |                 let result = analyzer.analyze(query); | ||||||
|                 let tokens = result.tokens(); |                 let tokens = result.tokens(); | ||||||
|                 builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) |                 builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) | ||||||
|             }, |             } | ||||||
|             None => (None, None), |             None => (None, None), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
| @@ -144,10 +142,11 @@ impl<'a> Search<'a> { | |||||||
|             None => self.perform_sort(NoopDistinct, matching_words, criteria), |             None => self.perform_sort(NoopDistinct, matching_words, criteria), | ||||||
|             Some(name) => { |             Some(name) => { | ||||||
|                 let field_ids_map = self.index.fields_ids_map(self.rtxn)?; |                 let field_ids_map = self.index.fields_ids_map(self.rtxn)?; | ||||||
|                 let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { |                 let id = | ||||||
|                     field_name: name.to_string(), |                     field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { | ||||||
|                     process: "distinct attribute", |                         field_name: name.to_string(), | ||||||
|                 })?; |                         process: "distinct attribute", | ||||||
|  |                     })?; | ||||||
|                 let distinct = FacetDistinct::new(id, self.index, self.rtxn); |                 let distinct = FacetDistinct::new(id, self.index, self.rtxn); | ||||||
|                 self.perform_sort(distinct, matching_words, criteria) |                 self.perform_sort(distinct, matching_words, criteria) | ||||||
|             } |             } | ||||||
| @@ -159,14 +158,15 @@ impl<'a> Search<'a> { | |||||||
|         mut distinct: D, |         mut distinct: D, | ||||||
|         matching_words: MatchingWords, |         matching_words: MatchingWords, | ||||||
|         mut criteria: Final, |         mut criteria: Final, | ||||||
|     ) -> Result<SearchResult> |     ) -> Result<SearchResult> { | ||||||
|     { |  | ||||||
|         let mut offset = self.offset; |         let mut offset = self.offset; | ||||||
|         let mut initial_candidates = RoaringBitmap::new(); |         let mut initial_candidates = RoaringBitmap::new(); | ||||||
|         let mut excluded_candidates = RoaringBitmap::new(); |         let mut excluded_candidates = RoaringBitmap::new(); | ||||||
|         let mut documents_ids = Vec::with_capacity(self.limit); |         let mut documents_ids = Vec::with_capacity(self.limit); | ||||||
|  |  | ||||||
|         while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? { |         while let Some(FinalResult { candidates, bucket_candidates, .. }) = | ||||||
|  |             criteria.next(&excluded_candidates)? | ||||||
|  |         { | ||||||
|             debug!("Number of candidates found {}", candidates.len()); |             debug!("Number of candidates found {}", candidates.len()); | ||||||
|  |  | ||||||
|             let excluded = take(&mut excluded_candidates); |             let excluded = take(&mut excluded_candidates); | ||||||
| @@ -183,7 +183,9 @@ impl<'a> Search<'a> { | |||||||
|             for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { |             for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { | ||||||
|                 documents_ids.push(candidate?); |                 documents_ids.push(candidate?); | ||||||
|             } |             } | ||||||
|             if documents_ids.len() == self.limit { break } |             if documents_ids.len() == self.limit { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|             excluded_candidates = candidates.into_excluded(); |             excluded_candidates = candidates.into_excluded(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -247,7 +249,7 @@ pub fn word_derivations<'c>( | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             Ok(entry.insert(derived_words)) |             Ok(entry.insert(derived_words)) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,6 +1,7 @@ | |||||||
| use std::iter::{Chain, FromIterator}; | use std::iter::{Chain, FromIterator}; | ||||||
| use std::ops::RangeInclusive; | use std::ops::RangeInclusive; | ||||||
| use roaring::bitmap::{RoaringBitmap, IntoIter}; |  | ||||||
|  | use roaring::bitmap::{IntoIter, RoaringBitmap}; | ||||||
|  |  | ||||||
| pub struct AvailableDocumentsIds { | pub struct AvailableDocumentsIds { | ||||||
|     iter: Chain<IntoIter, RangeInclusive<u32>>, |     iter: Chain<IntoIter, RangeInclusive<u32>>, | ||||||
| @@ -18,16 +19,12 @@ impl AvailableDocumentsIds { | |||||||
|                     None => 1..=0, // empty range iterator |                     None => 1..=0, // empty range iterator | ||||||
|                 }; |                 }; | ||||||
|  |  | ||||||
|                 AvailableDocumentsIds { |                 AvailableDocumentsIds { iter: available.into_iter().chain(iter) } | ||||||
|                     iter: available.into_iter().chain(iter), |             } | ||||||
|                 } |  | ||||||
|             }, |  | ||||||
|             None => { |             None => { | ||||||
|                 let empty = RoaringBitmap::new().into_iter(); |                 let empty = RoaringBitmap::new().into_iter(); | ||||||
|                 AvailableDocumentsIds { |                 AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) } | ||||||
|                     iter: empty.chain(0..=u32::max_value()), |             } | ||||||
|                 } |  | ||||||
|             }, |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use chrono::Utc; | use chrono::Utc; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; | use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result}; | ||||||
|  |  | ||||||
| pub struct ClearDocuments<'t, 'u, 'i> { | pub struct ClearDocuments<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|     pub fn new( |     pub fn new( | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         update_id: u64 |         update_id: u64, | ||||||
|     ) -> ClearDocuments<'t, 'u, 'i> { |     ) -> ClearDocuments<'t, 'u, 'i> { | ||||||
|  |  | ||||||
|         ClearDocuments { wtxn, index, _update_id: update_id } |         ClearDocuments { wtxn, index, _update_id: update_id } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
| mod tests { | mod tests { | ||||||
|     use heed::EnvOpenOptions; |     use heed::EnvOpenOptions; | ||||||
|  |  | ||||||
|     use crate::update::{IndexDocuments, UpdateFormat}; |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |     use crate::update::{IndexDocuments, UpdateFormat}; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn clear_documents() { |     fn clear_documents() { | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::collections::hash_map::Entry; | use std::collections::hash_map::Entry; | ||||||
|  | use std::collections::HashMap; | ||||||
|  |  | ||||||
| use chrono::Utc; | use chrono::Utc; | ||||||
| use fst::IntoStreamer; | use fst::IntoStreamer; | ||||||
| @@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit}; | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; | use super::ClearDocuments; | ||||||
|  | use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; | ||||||
| use crate::heed_codec::CboRoaringBitmapCodec; | use crate::heed_codec::CboRoaringBitmapCodec; | ||||||
| use crate::index::{db_name, main_key}; | use crate::index::{db_name, main_key}; | ||||||
| use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; | use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; | ||||||
| use super::ClearDocuments; |  | ||||||
|  |  | ||||||
| pub struct DeleteDocuments<'t, 'u, 'i> { | pub struct DeleteDocuments<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         update_id: u64, |         update_id: u64, | ||||||
|     ) -> Result<DeleteDocuments<'t, 'u, 'i>> |     ) -> Result<DeleteDocuments<'t, 'u, 'i>> { | ||||||
|     { |         let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); | ||||||
|         let external_documents_ids = index |  | ||||||
|             .external_documents_ids(wtxn)? |  | ||||||
|             .into_static(); |  | ||||||
|  |  | ||||||
|         Ok(DeleteDocuments { |         Ok(DeleteDocuments { | ||||||
|             wtxn, |             wtxn, | ||||||
| @@ -84,12 +81,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|                 key: Some(main_key::PRIMARY_KEY_KEY), |                 key: Some(main_key::PRIMARY_KEY_KEY), | ||||||
|             } |             } | ||||||
|         })?; |         })?; | ||||||
|         let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { |         let id_field = | ||||||
|             FieldIdMapMissingEntry::FieldName { |             fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName { | ||||||
|                 field_name: primary_key.to_string(), |                 field_name: primary_key.to_string(), | ||||||
|                 process: "DeleteDocuments::execute", |                 process: "DeleteDocuments::execute", | ||||||
|             } |             })?; | ||||||
|         })?; |  | ||||||
|  |  | ||||||
|         let Index { |         let Index { | ||||||
|             env: _env, |             env: _env, | ||||||
| @@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|                     let external_id = match serde_json::from_slice(content).unwrap() { |                     let external_id = match serde_json::from_slice(content).unwrap() { | ||||||
|                         Value::String(string) => SmallString32::from(string.as_str()), |                         Value::String(string) => SmallString32::from(string.as_str()), | ||||||
|                         Value::Number(number) => SmallString32::from(number.to_string()), |                         Value::Number(number) => SmallString32::from(number.to_string()), | ||||||
|                         document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), |                         document_id => { | ||||||
|  |                             return Err(UserError::InvalidDocumentId { document_id }.into()) | ||||||
|  |                         } | ||||||
|                     }; |                     }; | ||||||
|                     external_ids.push(external_id); |                     external_ids.push(external_id); | ||||||
|                 } |                 } | ||||||
| @@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|             if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { |             if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { | ||||||
|                 match entry.get().checked_sub(count_diff) { |                 match entry.get().checked_sub(count_diff) { | ||||||
|                     Some(0) | None => entry.remove(), |                     Some(0) | None => entry.remove(), | ||||||
|                     Some(count) => entry.insert(count) |                     Some(count) => entry.insert(count), | ||||||
|                 }; |                 }; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         // We construct an FST set that contains the words to delete from the words FST. |         // We construct an FST set that contains the words to delete from the words FST. | ||||||
|         let words_to_delete = words.iter().filter_map(|(word, must_remove)| { |         let words_to_delete = | ||||||
|             if *must_remove { Some(word.as_ref()) } else { None } |             words.iter().filter_map( | ||||||
|         }); |                 |(word, must_remove)| { | ||||||
|  |                     if *must_remove { | ||||||
|  |                         Some(word.as_ref()) | ||||||
|  |                     } else { | ||||||
|  |                         None | ||||||
|  |                     } | ||||||
|  |                 }, | ||||||
|  |             ); | ||||||
|         let words_to_delete = fst::Set::from_iter(words_to_delete)?; |         let words_to_delete = fst::Set::from_iter(words_to_delete)?; | ||||||
|  |  | ||||||
|         let new_words_fst = { |         let new_words_fst = { | ||||||
| @@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         // We delete the documents ids that are under the pairs of words, |         // We delete the documents ids that are under the pairs of words, | ||||||
|         // it is faster and use no memory to iterate over all the words pairs than |         // it is faster and use no memory to iterate over all the words pairs than | ||||||
|         // to compute the cartesian product of every words of the deleted documents. |         // to compute the cartesian product of every words of the deleted documents. | ||||||
|         let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; |         let mut iter = | ||||||
|  |             word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; | ||||||
|         while let Some(result) = iter.next() { |         while let Some(result) = iter.next() { | ||||||
|             let (bytes, mut docids) = result?; |             let (bytes, mut docids) = result?; | ||||||
|             let previous_len = docids.len(); |             let previous_len = docids.len(); | ||||||
| @@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         drop(iter); |         drop(iter); | ||||||
|  |  | ||||||
|         // We delete the documents ids that are under the word level position docids. |         // We delete the documents ids that are under the word level position docids. | ||||||
|         let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); |         let mut iter = | ||||||
|  |             word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); | ||||||
|         while let Some(result) = iter.next() { |         while let Some(result) = iter.next() { | ||||||
|             let (bytes, mut docids) = result?; |             let (bytes, mut docids) = result?; | ||||||
|             let previous_len = docids.len(); |             let previous_len = docids.len(); | ||||||
| @@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         drop(iter); |         drop(iter); | ||||||
|  |  | ||||||
|         // We delete the documents ids that are under the word prefix level position docids. |         // We delete the documents ids that are under the word prefix level position docids. | ||||||
|         let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); |         let mut iter = | ||||||
|  |             word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); | ||||||
|         while let Some(result) = iter.next() { |         while let Some(result) = iter.next() { | ||||||
|             let (bytes, mut docids) = result?; |             let (bytes, mut docids) = result?; | ||||||
|             let previous_len = docids.len(); |             let previous_len = docids.len(); | ||||||
| @@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( | |||||||
|     convert: F, |     convert: F, | ||||||
| ) -> heed::Result<()> | ) -> heed::Result<()> | ||||||
| where | where | ||||||
|     C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>, |     C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, | ||||||
|     F: Fn(K) -> DocumentId, |     F: Fn(K) -> DocumentId, | ||||||
| { | { | ||||||
|     let mut iter = db.remap_key_type::<ByteSlice>() |     let mut iter = | ||||||
|         .prefix_iter_mut(wtxn, &[field_id])? |         db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>(); | ||||||
|         .remap_key_type::<C>(); |  | ||||||
|  |  | ||||||
|     while let Some(result) = iter.next() { |     while let Some(result) = iter.next() { | ||||||
|         let (key, ()) = result?; |         let (key, ()) = result?; | ||||||
| @@ -441,8 +448,8 @@ where | |||||||
| mod tests { | mod tests { | ||||||
|     use heed::EnvOpenOptions; |     use heed::EnvOpenOptions; | ||||||
|  |  | ||||||
|     use crate::update::{IndexDocuments, UpdateFormat}; |  | ||||||
|     use super::*; |     use super::*; | ||||||
|  |     use crate::update::{IndexDocuments, UpdateFormat}; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn delete_documents_with_numbers_as_primary_key() { |     fn delete_documents_with_numbers_as_primary_key() { | ||||||
|   | |||||||
| @@ -3,17 +3,18 @@ use std::fs::File; | |||||||
| use std::num::NonZeroUsize; | use std::num::NonZeroUsize; | ||||||
|  |  | ||||||
| use chrono::Utc; | use chrono::Utc; | ||||||
| use grenad::{CompressionType, Reader, Writer, FileFuse}; | use grenad::{CompressionType, FileFuse, Reader, Writer}; | ||||||
| use heed::types::{ByteSlice, DecodeIgnore}; | use heed::types::{ByteSlice, DecodeIgnore}; | ||||||
| use heed::{BytesEncode, Error}; | use heed::{BytesEncode, Error}; | ||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::error::InternalError; | use crate::error::InternalError; | ||||||
| use crate::heed_codec::CboRoaringBitmapCodec; |  | ||||||
| use crate::heed_codec::facet::FacetLevelValueF64Codec; | use crate::heed_codec::facet::FacetLevelValueF64Codec; | ||||||
| use crate::update::index_documents::WriteMethod; | use crate::heed_codec::CboRoaringBitmapCodec; | ||||||
| use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; | use crate::update::index_documents::{ | ||||||
|  |     create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, | ||||||
|  | }; | ||||||
| use crate::{Index, Result}; | use crate::{Index, Result}; | ||||||
|  |  | ||||||
| pub struct Facets<'t, 'u, 'i> { | pub struct Facets<'t, 'u, 'i> { | ||||||
| @@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | |||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         update_id: u64, |         update_id: u64, | ||||||
|     ) -> Facets<'t, 'u, 'i> |     ) -> Facets<'t, 'u, 'i> { | ||||||
|     { |  | ||||||
|         Facets { |         Facets { | ||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
| @@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | |||||||
|             )?; |             )?; | ||||||
|  |  | ||||||
|             // Clear the facet number levels. |             // Clear the facet number levels. | ||||||
|             clear_field_number_levels( |             clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; | ||||||
|                 self.wtxn, |  | ||||||
|                 self.index.facet_id_f64_docids, |  | ||||||
|                 field_id, |  | ||||||
|             )?; |  | ||||||
|  |  | ||||||
|             // Compute and store the faceted numbers documents ids. |             // Compute and store the faceted numbers documents ids. | ||||||
|             let number_documents_ids = compute_faceted_documents_ids( |             let number_documents_ids = compute_faceted_documents_ids( | ||||||
| @@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | |||||||
|                 field_id, |                 field_id, | ||||||
|             )?; |             )?; | ||||||
|  |  | ||||||
|             self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; |             self.index.put_string_faceted_documents_ids( | ||||||
|             self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; |                 self.wtxn, | ||||||
|  |                 field_id, | ||||||
|  |                 &string_documents_ids, | ||||||
|  |             )?; | ||||||
|  |             self.index.put_number_faceted_documents_ids( | ||||||
|  |                 self.wtxn, | ||||||
|  |                 field_id, | ||||||
|  |                 &number_documents_ids, | ||||||
|  |             )?; | ||||||
|  |  | ||||||
|             write_into_lmdb_database( |             write_into_lmdb_database( | ||||||
|                 self.wtxn, |                 self.wtxn, | ||||||
| @@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn clear_field_number_levels<'t, >( | fn clear_field_number_levels<'t>( | ||||||
|     wtxn: &'t mut heed::RwTxn, |     wtxn: &'t mut heed::RwTxn, | ||||||
|     db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, |     db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||||
|     field_id: u8, |     field_id: u8, | ||||||
| ) -> heed::Result<()> | ) -> heed::Result<()> { | ||||||
| { |  | ||||||
|     let left = (field_id, 1, f64::MIN, f64::MIN); |     let left = (field_id, 1, f64::MIN, f64::MIN); | ||||||
|     let right = (field_id, u8::MAX, f64::MAX, f64::MAX); |     let right = (field_id, u8::MAX, f64::MAX, f64::MAX); | ||||||
|     let range = left..=right; |     let range = left..=right; | ||||||
| @@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>( | |||||||
|     level_group_size: NonZeroUsize, |     level_group_size: NonZeroUsize, | ||||||
|     min_level_size: NonZeroUsize, |     min_level_size: NonZeroUsize, | ||||||
|     field_id: u8, |     field_id: u8, | ||||||
| ) -> Result<Reader<FileFuse>> | ) -> Result<Reader<FileFuse>> { | ||||||
| { |  | ||||||
|     let first_level_size = db |     let first_level_size = db | ||||||
|         .remap_key_type::<ByteSlice>() |         .remap_key_type::<ByteSlice>() | ||||||
|         .prefix_iter(rtxn, &[field_id])? |         .prefix_iter(rtxn, &[field_id])? | ||||||
| @@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>( | |||||||
|  |  | ||||||
|     // It is forbidden to keep a cursor and write in a database at the same time with LMDB |     // It is forbidden to keep a cursor and write in a database at the same time with LMDB | ||||||
|     // therefore we write the facet levels entries into a grenad file before transfering them. |     // therefore we write the facet levels entries into a grenad file before transfering them. | ||||||
|     let mut writer = tempfile::tempfile().and_then(|file| { |     let mut writer = tempfile::tempfile() | ||||||
|         create_writer(compression_type, compression_level, file) |         .and_then(|file| create_writer(compression_type, compression_level, file))?; | ||||||
|     })?; |  | ||||||
|  |  | ||||||
|     let level_0_range = { |     let level_0_range = { | ||||||
|         let left = (field_id, 0, f64::MIN, f64::MIN); |         let left = (field_id, 0, f64::MIN, f64::MIN); | ||||||
| @@ -196,8 +197,7 @@ fn compute_faceted_documents_ids( | |||||||
|     rtxn: &heed::RoTxn, |     rtxn: &heed::RoTxn, | ||||||
|     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, |     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|     field_id: u8, |     field_id: u8, | ||||||
| ) -> Result<RoaringBitmap> | ) -> Result<RoaringBitmap> { | ||||||
| { |  | ||||||
|     let mut documents_ids = RoaringBitmap::new(); |     let mut documents_ids = RoaringBitmap::new(); | ||||||
|  |  | ||||||
|     for result in db.prefix_iter(rtxn, &[field_id])? { |     for result in db.prefix_iter(rtxn, &[field_id])? { | ||||||
| @@ -215,8 +215,7 @@ fn write_number_entry( | |||||||
|     left: f64, |     left: f64, | ||||||
|     right: f64, |     right: f64, | ||||||
|     ids: &RoaringBitmap, |     ids: &RoaringBitmap, | ||||||
| ) -> Result<()> | ) -> Result<()> { | ||||||
| { |  | ||||||
|     let key = (field_id, level, left, right); |     let key = (field_id, level, left, right); | ||||||
|     let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; |     let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||||
|     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; |     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::HashSet; | use std::collections::HashSet; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; | use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; | ||||||
| use std::num::{NonZeroU32, NonZeroUsize}; | use std::num::{NonZeroU32, NonZeroUsize}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
| use std::str; | use std::str; | ||||||
| @@ -10,28 +10,26 @@ use std::time::Instant; | |||||||
|  |  | ||||||
| use bstr::ByteSlice as _; | use bstr::ByteSlice as _; | ||||||
| use chrono::Utc; | use chrono::Utc; | ||||||
| use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; | use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use log::{debug, info, error}; | use log::{debug, error, info}; | ||||||
| use memmap::Mmap; | use memmap::Mmap; | ||||||
| use rayon::prelude::*; | use rayon::prelude::*; | ||||||
| use rayon::ThreadPool; | use rayon::ThreadPool; | ||||||
| use serde::{Serialize, Deserialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| use crate::error::{Error, InternalError}; |  | ||||||
| use crate::{Index, Result}; |  | ||||||
| use crate::update::{ |  | ||||||
|     Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, |  | ||||||
|     WordPrefixPairProximityDocids, |  | ||||||
| }; |  | ||||||
| use self::store::{Store, Readers}; |  | ||||||
| pub use self::merge_function::{ | pub use self::merge_function::{ | ||||||
|     fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first |     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, | ||||||
| }; | }; | ||||||
|  | use self::store::{Readers, Store}; | ||||||
| pub use self::transform::{Transform, TransformOutput}; | pub use self::transform::{Transform, TransformOutput}; | ||||||
|  |  | ||||||
| use crate::MergeFn; |  | ||||||
| use super::UpdateBuilder; | use super::UpdateBuilder; | ||||||
|  | use crate::error::{Error, InternalError}; | ||||||
|  | use crate::update::{ | ||||||
|  |     Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, | ||||||
|  |     WordsLevelPositions, WordsPrefixesFst, | ||||||
|  | }; | ||||||
|  | use crate::{Index, MergeFn, Result}; | ||||||
|  |  | ||||||
| mod merge_function; | mod merge_function; | ||||||
| mod store; | mod store; | ||||||
| @@ -48,7 +46,11 @@ pub enum WriteMethod { | |||||||
|     GetMergePut, |     GetMergePut, | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { | pub fn create_writer( | ||||||
|  |     typ: CompressionType, | ||||||
|  |     level: Option<u32>, | ||||||
|  |     file: File, | ||||||
|  | ) -> io::Result<Writer<File>> { | ||||||
|     let mut builder = Writer::builder(); |     let mut builder = Writer::builder(); | ||||||
|     builder.compression_type(typ); |     builder.compression_type(typ); | ||||||
|     if let Some(level) = level { |     if let Some(level) = level { | ||||||
| @@ -64,8 +66,7 @@ pub fn create_sorter<E>( | |||||||
|     chunk_fusing_shrink_size: Option<u64>, |     chunk_fusing_shrink_size: Option<u64>, | ||||||
|     max_nb_chunks: Option<usize>, |     max_nb_chunks: Option<usize>, | ||||||
|     max_memory: Option<usize>, |     max_memory: Option<usize>, | ||||||
| ) -> Sorter<MergeFn<E>> | ) -> Sorter<MergeFn<E>> { | ||||||
| { |  | ||||||
|     let mut builder = Sorter::builder(merge); |     let mut builder = Sorter::builder(merge); | ||||||
|     if let Some(shrink_size) = chunk_fusing_shrink_size { |     if let Some(shrink_size) = chunk_fusing_shrink_size { | ||||||
|         builder.file_fusing_shrink_size(shrink_size); |         builder.file_fusing_shrink_size(shrink_size); | ||||||
| @@ -83,7 +84,10 @@ pub fn create_sorter<E>( | |||||||
|     builder.build() |     builder.build() | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> { | pub fn writer_into_reader( | ||||||
|  |     writer: Writer<File>, | ||||||
|  |     shrink_size: Option<u64>, | ||||||
|  | ) -> Result<Reader<FileFuse>> { | ||||||
|     let mut file = writer.into_inner()?; |     let mut file = writer.into_inner()?; | ||||||
|     file.seek(SeekFrom::Start(0))?; |     file.seek(SeekFrom::Start(0))?; | ||||||
|     let file = if let Some(shrink_size) = shrink_size { |     let file = if let Some(shrink_size) = shrink_size { | ||||||
| @@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res | |||||||
| pub fn merge_readers<E>( | pub fn merge_readers<E>( | ||||||
|     sources: Vec<Reader<FileFuse>>, |     sources: Vec<Reader<FileFuse>>, | ||||||
|     merge: MergeFn<E>, |     merge: MergeFn<E>, | ||||||
| ) -> Merger<FileFuse, MergeFn<E>> | ) -> Merger<FileFuse, MergeFn<E>> { | ||||||
| { |  | ||||||
|     let mut builder = Merger::builder(merge); |     let mut builder = Merger::builder(merge); | ||||||
|     builder.extend(sources); |     builder.extend(sources); | ||||||
|     builder.build() |     builder.build() | ||||||
| @@ -118,13 +121,7 @@ where | |||||||
|     let before = Instant::now(); |     let before = Instant::now(); | ||||||
|  |  | ||||||
|     let merger = merge_readers(sources, merge); |     let merger = merge_readers(sources, merge); | ||||||
|     merger_iter_into_lmdb_database( |     merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; | ||||||
|         wtxn, |  | ||||||
|         database, |  | ||||||
|         merger.into_merge_iter()?, |  | ||||||
|         merge, |  | ||||||
|         method, |  | ||||||
|     )?; |  | ||||||
|  |  | ||||||
|     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); |     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); | ||||||
|     Ok(()) |     Ok(()) | ||||||
| @@ -149,7 +146,7 @@ where | |||||||
|             while let Some((k, v)) = reader.next()? { |             while let Some((k, v)) = reader.next()? { | ||||||
|                 out_iter.append(k, v)?; |                 out_iter.append(k, v)?; | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         WriteMethod::GetMergePut => { |         WriteMethod::GetMergePut => { | ||||||
|             while let Some((k, v)) = reader.next()? { |             while let Some((k, v)) = reader.next()? { | ||||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; |                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||||
| @@ -158,11 +155,11 @@ where | |||||||
|                         let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; |                         let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; | ||||||
|                         let val = merge(k, &vals)?; |                         let val = merge(k, &vals)?; | ||||||
|                         iter.put_current(k, &val)?; |                         iter.put_current(k, &val)?; | ||||||
|                     }, |                     } | ||||||
|                     _ => { |                     _ => { | ||||||
|                         drop(iter); |                         drop(iter); | ||||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; |                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||||
|                     }, |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>( | |||||||
| ) -> Result<()> | ) -> Result<()> | ||||||
| where | where | ||||||
|     Error: From<E>, |     Error: From<E>, | ||||||
|     Error: From<grenad::Error<E>> |     Error: From<grenad::Error<E>>, | ||||||
| { | { | ||||||
|     debug!("Writing MTBL sorter..."); |     debug!("Writing MTBL sorter..."); | ||||||
|     let before = Instant::now(); |     let before = Instant::now(); | ||||||
|  |  | ||||||
|     merger_iter_into_lmdb_database( |     merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; | ||||||
|         wtxn, |  | ||||||
|         database, |  | ||||||
|         sorter.into_iter()?, |  | ||||||
|         merge, |  | ||||||
|         method, |  | ||||||
|     )?; |  | ||||||
|  |  | ||||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); |     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); | ||||||
|     Ok(()) |     Ok(()) | ||||||
| @@ -214,7 +205,7 @@ where | |||||||
|             while let Some((k, v)) = sorter.next()? { |             while let Some((k, v)) = sorter.next()? { | ||||||
|                 out_iter.append(k, v)?; |                 out_iter.append(k, v)?; | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|         WriteMethod::GetMergePut => { |         WriteMethod::GetMergePut => { | ||||||
|             while let Some((k, v)) = sorter.next()? { |             while let Some((k, v)) = sorter.next()? { | ||||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; |                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||||
| @@ -226,14 +217,14 @@ where | |||||||
|                             InternalError::IndexingMergingKeys { process: "get-put-merge" } |                             InternalError::IndexingMergingKeys { process: "get-put-merge" } | ||||||
|                         })?; |                         })?; | ||||||
|                         iter.put_current(k, &val)?; |                         iter.put_current(k, &val)?; | ||||||
|                     }, |                     } | ||||||
|                     _ => { |                     _ => { | ||||||
|                         drop(iter); |                         drop(iter); | ||||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; |                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||||
|                     }, |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| @@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|  |  | ||||||
|         // Early return when there is no document to add |         // Early return when there is no document to add | ||||||
|         if reader.buffer().is_empty() { |         if reader.buffer().is_empty() { | ||||||
|             return Ok(DocumentAdditionResult { |             return Ok(DocumentAdditionResult { nb_documents: 0 }); | ||||||
|                 nb_documents: 0, |  | ||||||
|             }) |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         self.index.set_updated_at(self.wtxn, &Utc::now())?; |         self.index.set_updated_at(self.wtxn, &Utc::now())?; | ||||||
| @@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|         let output = match self.update_format { |         let output = match self.update_format { | ||||||
|             UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, |             UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, | ||||||
|             UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, |             UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, | ||||||
|             UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?, |             UpdateFormat::JsonStream => { | ||||||
|  |                 transform.output_from_json_stream(reader, &progress_callback)? | ||||||
|  |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let nb_documents = output.documents_count; |         let nb_documents = output.documents_count; | ||||||
| @@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|  |  | ||||||
|     pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()> |     pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()> | ||||||
|     where |     where | ||||||
|         F: Fn(UpdateIndexingStep) + Sync |         F: Fn(UpdateIndexingStep) + Sync, | ||||||
|     { |     { | ||||||
|         let before_indexing = Instant::now(); |         let before_indexing = Instant::now(); | ||||||
|  |  | ||||||
| @@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                 // settings if none have already been set. |                 // settings if none have already been set. | ||||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().build()?; |                 backup_pool = rayon::ThreadPoolBuilder::new().build()?; | ||||||
|                 &backup_pool |                 &backup_pool | ||||||
|             }, |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let readers = pool.install(|| { |         let readers = pool.install(|| { | ||||||
| @@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|  |  | ||||||
|         let mut documents_ids = self.index.documents_ids(self.wtxn)?; |         let mut documents_ids = self.index.documents_ids(self.wtxn)?; | ||||||
|         let contains_documents = !documents_ids.is_empty(); |         let contains_documents = !documents_ids.is_empty(); | ||||||
|         let write_method = if contains_documents { |         let write_method = | ||||||
|             WriteMethod::GetMergePut |             if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; | ||||||
|         } else { |  | ||||||
|             WriteMethod::Append |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         debug!("Writing using the write method: {:?}", write_method); |         debug!("Writing using the write method: {:?}", write_method); | ||||||
|  |  | ||||||
| @@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|             *self.index.docid_word_positions.as_polymorph(), |             *self.index.docid_word_positions.as_polymorph(), | ||||||
|             docid_word_positions_readers, |             docid_word_positions_readers, | ||||||
|             keep_first, |             keep_first, | ||||||
|             write_method |             write_method, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         database_count += 1; |         database_count += 1; | ||||||
| @@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|             *self.index.documents.as_polymorph(), |             *self.index.documents.as_polymorph(), | ||||||
|             documents_readers, |             documents_readers, | ||||||
|             keep_first, |             keep_first, | ||||||
|             write_method |             write_method, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         database_count += 1; |         database_count += 1; | ||||||
| @@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                         fst_merge, |                         fst_merge, | ||||||
|                         WriteMethod::GetMergePut, |                         WriteMethod::GetMergePut, | ||||||
|                     )?; |                     )?; | ||||||
|                 }, |                 } | ||||||
|                 DatabaseType::WordDocids => { |                 DatabaseType::WordDocids => { | ||||||
|                     debug!("Writing the words docids into LMDB on disk..."); |                     debug!("Writing the words docids into LMDB on disk..."); | ||||||
|                     let db = *self.index.word_docids.as_polymorph(); |                     let db = *self.index.word_docids.as_polymorph(); | ||||||
| @@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                         roaring_bitmap_merge, |                         roaring_bitmap_merge, | ||||||
|                         write_method, |                         write_method, | ||||||
|                     )?; |                     )?; | ||||||
|                 }, |                 } | ||||||
|                 DatabaseType::FacetLevel0NumbersDocids => { |                 DatabaseType::FacetLevel0NumbersDocids => { | ||||||
|                     debug!("Writing the facet numbers docids into LMDB on disk..."); |                     debug!("Writing the facet numbers docids into LMDB on disk..."); | ||||||
|                     let db = *self.index.facet_id_f64_docids.as_polymorph(); |                     let db = *self.index.facet_id_f64_docids.as_polymorph(); | ||||||
| @@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                         cbo_roaring_bitmap_merge, |                         cbo_roaring_bitmap_merge, | ||||||
|                         write_method, |                         write_method, | ||||||
|                     )?; |                     )?; | ||||||
|                 }, |                 } | ||||||
|                 DatabaseType::FieldIdWordCountDocids => { |                 DatabaseType::FieldIdWordCountDocids => { | ||||||
|                     debug!("Writing the field id word count docids into LMDB on disk..."); |                     debug!("Writing the field id word count docids into LMDB on disk..."); | ||||||
|                     let db = *self.index.field_id_word_count_docids.as_polymorph(); |                     let db = *self.index.field_id_word_count_docids.as_polymorph(); | ||||||
| @@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                         cbo_roaring_bitmap_merge, |                         cbo_roaring_bitmap_merge, | ||||||
|                         write_method, |                         write_method, | ||||||
|                     )?; |                     )?; | ||||||
|                 }, |                 } | ||||||
|                 DatabaseType::WordLevel0PositionDocids => { |                 DatabaseType::WordLevel0PositionDocids => { | ||||||
|                     debug!("Writing the word level 0 positions docids into LMDB on disk..."); |                     debug!("Writing the word level 0 positions docids into LMDB on disk..."); | ||||||
|                     let db = *self.index.word_level_position_docids.as_polymorph(); |                     let db = *self.index.word_level_position_docids.as_polymorph(); | ||||||
| @@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |  | ||||||
|     use heed::EnvOpenOptions; |     use heed::EnvOpenOptions; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple_document_replacement() { |     fn simple_document_replacement() { | ||||||
|         let path = tempfile::tempdir().unwrap(); |         let path = tempfile::tempdir().unwrap(); | ||||||
| @@ -1053,9 +1042,8 @@ mod tests { | |||||||
|         assert_eq!(count, 3); |         assert_eq!(count, 3); | ||||||
|  |  | ||||||
|         let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); |         let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); | ||||||
|         let (kevin_id, _) = docs.iter().find(|(_, d)| { |         let (kevin_id, _) = | ||||||
|             d.get(0).unwrap() == br#""updated kevin""# |             docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); | ||||||
|         }).unwrap(); |  | ||||||
|         let (id, doc) = docs[*kevin_id as usize]; |         let (id, doc) = docs[*kevin_id as usize]; | ||||||
|         assert_eq!(id, *kevin_id); |         assert_eq!(id, *kevin_id); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -8,25 +8,29 @@ use std::{cmp, iter}; | |||||||
|  |  | ||||||
| use bstr::ByteSlice as _; | use bstr::ByteSlice as _; | ||||||
| use fst::Set; | use fst::Set; | ||||||
| use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; | use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; | ||||||
| use heed::BytesEncode; | use heed::BytesEncode; | ||||||
| use linked_hash_map::LinkedHashMap; | use linked_hash_map::LinkedHashMap; | ||||||
| use log::{debug, info}; | use log::{debug, info}; | ||||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; | use meilisearch_tokenizer::token::SeparatorKind; | ||||||
|  | use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; | ||||||
| use ordered_float::OrderedFloat; | use ordered_float::OrderedFloat; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| use tempfile::tempfile; | use tempfile::tempfile; | ||||||
|  |  | ||||||
|  | use super::merge_function::{ | ||||||
|  |     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, | ||||||
|  | }; | ||||||
|  | use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; | ||||||
| use crate::error::{Error, InternalError, SerializationError}; | use crate::error::{Error, InternalError, SerializationError}; | ||||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; | use crate::heed_codec::facet::{ | ||||||
| use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; |     FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, | ||||||
|  |     FieldDocIdFacetStringCodec, | ||||||
|  | }; | ||||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||||
| use crate::update::UpdateIndexingStep; | use crate::update::UpdateIndexingStep; | ||||||
| use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; | use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; | ||||||
|  |  | ||||||
| use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; |  | ||||||
| use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge}; |  | ||||||
|  |  | ||||||
| const LMDB_MAX_KEY_LENGTH: usize = 511; | const LMDB_MAX_KEY_LENGTH: usize = 511; | ||||||
| const ONE_KILOBYTE: usize = 1024 * 1024; | const ONE_KILOBYTE: usize = 1024 * 1024; | ||||||
| @@ -56,7 +60,8 @@ pub struct Store<'s, A> { | |||||||
|     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, |     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, | ||||||
|     word_docids_limit: usize, |     word_docids_limit: usize, | ||||||
|     field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, |     field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, | ||||||
|     words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, |     words_pairs_proximities_docids: | ||||||
|  |         LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, | ||||||
|     words_pairs_proximities_docids_limit: usize, |     words_pairs_proximities_docids_limit: usize, | ||||||
|     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, |     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, | ||||||
|     facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, |     facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, | ||||||
| @@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         chunk_compression_level: Option<u32>, |         chunk_compression_level: Option<u32>, | ||||||
|         chunk_fusing_shrink_size: Option<u64>, |         chunk_fusing_shrink_size: Option<u64>, | ||||||
|         stop_words: Option<&'s Set<A>>, |         stop_words: Option<&'s Set<A>>, | ||||||
|     ) -> Result<Self> |     ) -> Result<Self> { | ||||||
|     { |  | ||||||
|         // We divide the max memory by the number of sorters the Store has. |         // We divide the max memory by the number of sorters the Store has. | ||||||
|         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); |         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); | ||||||
|         let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); |         let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); | ||||||
| @@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|             Some(1024 * 1024 * 1024), // 1GB |             Some(1024 * 1024 * 1024), // 1GB | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let documents_writer = tempfile().and_then(|f| { |         let documents_writer = tempfile() | ||||||
|             create_writer(chunk_compression_type, chunk_compression_level, f) |             .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; | ||||||
|         })?; |         let docid_word_positions_writer = tempfile() | ||||||
|         let docid_word_positions_writer = tempfile().and_then(|f| { |             .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; | ||||||
|             create_writer(chunk_compression_type, chunk_compression_level, f) |  | ||||||
|         })?; |  | ||||||
|  |  | ||||||
|         let mut config = AnalyzerConfig::default(); |         let mut config = AnalyzerConfig::default(); | ||||||
|         if let Some(stop_words) = stop_words { |         if let Some(stop_words) = stop_words { | ||||||
| @@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { |     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { | ||||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. |         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||||
|         match self.word_docids.get_refresh(word.as_bytes()) { |         match self.word_docids.get_refresh(word.as_bytes()) { | ||||||
|             Some(old) => { old.insert(id); }, |             Some(old) => { | ||||||
|  |                 old.insert(id); | ||||||
|  |             } | ||||||
|             None => { |             None => { | ||||||
|                 let word_vec = SmallVec32::from(word.as_bytes()); |                 let word_vec = SmallVec32::from(word.as_bytes()); | ||||||
|                 // A newly inserted element is appended at the end of the linked hash map. |                 // A newly inserted element is appended at the end of the linked hash map. | ||||||
| @@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         value: OrderedFloat<f64>, |         value: OrderedFloat<f64>, | ||||||
|         id: DocumentId, |         id: DocumentId, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         let sorter = &mut self.field_id_docid_facet_numbers_sorter; |         let sorter = &mut self.field_id_docid_facet_numbers_sorter; | ||||||
|         Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; |         Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; | ||||||
|  |  | ||||||
|         let key = (field_id, value); |         let key = (field_id, value); | ||||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. |         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||||
|         match self.facet_field_number_docids.get_refresh(&key) { |         match self.facet_field_number_docids.get_refresh(&key) { | ||||||
|             Some(old) => { old.insert(id); }, |             Some(old) => { | ||||||
|  |                 old.insert(id); | ||||||
|  |             } | ||||||
|             None => { |             None => { | ||||||
|                 // A newly inserted element is appended at the end of the linked hash map. |                 // A newly inserted element is appended at the end of the linked hash map. | ||||||
|                 self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); |                 self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||||
| @@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         value: String, |         value: String, | ||||||
|         id: DocumentId, |         id: DocumentId, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         let sorter = &mut self.field_id_docid_facet_strings_sorter; |         let sorter = &mut self.field_id_docid_facet_strings_sorter; | ||||||
|         Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; |         Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; | ||||||
|  |  | ||||||
|         let key = (field_id, value); |         let key = (field_id, value); | ||||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. |         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||||
|         match self.facet_field_string_docids.get_refresh(&key) { |         match self.facet_field_string_docids.get_refresh(&key) { | ||||||
|             Some(old) => { old.insert(id); }, |             Some(old) => { | ||||||
|  |                 old.insert(id); | ||||||
|  |             } | ||||||
|             None => { |             None => { | ||||||
|                 // A newly inserted element is appended at the end of the linked hash map. |                 // A newly inserted element is appended at the end of the linked hash map. | ||||||
|                 self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); |                 self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||||
| @@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|     // Save the documents ids under the words pairs proximities that it contains. |     // Save the documents ids under the words pairs proximities that it contains. | ||||||
|     fn insert_words_pairs_proximities_docids<'a>( |     fn insert_words_pairs_proximities_docids<'a>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>, |         words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>, | ||||||
|         id: DocumentId, |         id: DocumentId, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         for ((w1, w2), prox) in words_pairs_proximities { |         for ((w1, w2), prox) in words_pairs_proximities { | ||||||
|             let w1 = SmallVec32::from(w1.as_bytes()); |             let w1 = SmallVec32::from(w1.as_bytes()); | ||||||
|             let w2 = SmallVec32::from(w2.as_bytes()); |             let w2 = SmallVec32::from(w2.as_bytes()); | ||||||
| @@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|             // if get_refresh finds the element it is assured |             // if get_refresh finds the element it is assured | ||||||
|             // to be at the end of the linked hash map. |             // to be at the end of the linked hash map. | ||||||
|             match self.words_pairs_proximities_docids.get_refresh(&key) { |             match self.words_pairs_proximities_docids.get_refresh(&key) { | ||||||
|                 Some(old) => { old.insert(id); }, |                 Some(old) => { | ||||||
|  |                     old.insert(id); | ||||||
|  |                 } | ||||||
|                 None => { |                 None => { | ||||||
|                     // A newly inserted element is appended at the end of the linked hash map. |                     // A newly inserted element is appended at the end of the linked hash map. | ||||||
|                     let ids = RoaringBitmap::from_iter(Some(id)); |                     let ids = RoaringBitmap::from_iter(Some(id)); | ||||||
| @@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|             // Removing front elements is equivalent to removing the LRUs. |             // Removing front elements is equivalent to removing the LRUs. | ||||||
|             let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); |             let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); | ||||||
|             iter.take(overflow).for_each(|x| lrus.push(x)); |             iter.take(overflow).for_each(|x| lrus.push(x)); | ||||||
|             Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?; |             Self::write_words_pairs_proximities( | ||||||
|  |                 &mut self.words_pairs_proximities_docids_sorter, | ||||||
|  |                 lrus, | ||||||
|  |             )?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, |         facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, | ||||||
|         facet_strings_values: &mut HashMap<FieldId, Vec<String>>, |         facet_strings_values: &mut HashMap<FieldId, Vec<String>>, | ||||||
|         record: &[u8], |         record: &[u8], | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         // We compute the list of words pairs proximities (self-join) and write it directly to disk. |         // We compute the list of words pairs proximities (self-join) and write it directly to disk. | ||||||
|         let words_pair_proximities = compute_words_pair_proximities(&words_positions); |         let words_pair_proximities = compute_words_pair_proximities(&words_positions); | ||||||
|         self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; |         self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; | ||||||
| @@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         self.documents_writer.insert(document_id.to_be_bytes(), record)?; |         self.documents_writer.insert(document_id.to_be_bytes(), record)?; | ||||||
|         Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; |         Self::write_docid_word_positions( | ||||||
|         Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; |             &mut self.docid_word_positions_writer, | ||||||
|  |             document_id, | ||||||
|  |             words_positions, | ||||||
|  |         )?; | ||||||
|  |         Self::write_word_position_docids( | ||||||
|  |             &mut self.word_level_position_docids_sorter, | ||||||
|  |             document_id, | ||||||
|  |             words_positions, | ||||||
|  |         )?; | ||||||
|  |  | ||||||
|         words_positions.clear(); |         words_positions.clear(); | ||||||
|  |  | ||||||
| @@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|  |  | ||||||
|     fn write_words_pairs_proximities<E>( |     fn write_words_pairs_proximities<E>( | ||||||
|         sorter: &mut Sorter<MergeFn<E>>, |         sorter: &mut Sorter<MergeFn<E>>, | ||||||
|         iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>, |         iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>, | ||||||
|     ) -> Result<()> |     ) -> Result<()> | ||||||
|     where |     where | ||||||
|         Error: From<E>, |         Error: From<E>, | ||||||
| @@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         writer: &mut Writer<File>, |         writer: &mut Writer<File>, | ||||||
|         id: DocumentId, |         id: DocumentId, | ||||||
|         words_positions: &HashMap<String, SmallVec32<Position>>, |         words_positions: &HashMap<String, SmallVec32<Position>>, | ||||||
|     ) -> Result<()> |     ) -> Result<()> { | ||||||
|     { |  | ||||||
|         // We prefix the words by the document id. |         // We prefix the words by the document id. | ||||||
|         let mut key = id.to_be_bytes().to_vec(); |         let mut key = id.to_be_bytes().to_vec(); | ||||||
|         let mut buffer = Vec::new(); |         let mut buffer = Vec::new(); | ||||||
| @@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn write_facet_field_string_docids<I, E>( |     fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||||
|         sorter: &mut Sorter<MergeFn<E>>, |  | ||||||
|         iter: I, |  | ||||||
|     ) -> Result<()> |  | ||||||
|     where |     where | ||||||
|         I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>, |         I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>, | ||||||
|         Error: From<E>, |         Error: From<E>, | ||||||
|     { |     { | ||||||
|         let mut key_buffer = Vec::new(); |         let mut key_buffer = Vec::new(); | ||||||
| @@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn write_facet_field_number_docids<I, E>( |     fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||||
|         sorter: &mut Sorter<MergeFn<E>>, |  | ||||||
|         iter: I, |  | ||||||
|     ) -> Result<()> |  | ||||||
|     where |     where | ||||||
|         I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>, |         I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>, | ||||||
|         Error: From<E>, |         Error: From<E>, | ||||||
|     { |     { | ||||||
|         let mut data_buffer = Vec::new(); |         let mut data_buffer = Vec::new(); | ||||||
| @@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|  |  | ||||||
|     fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> |     fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||||
|     where |     where | ||||||
|         I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>, |         I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>, | ||||||
|         Error: From<E>, |         Error: From<E>, | ||||||
|     { |     { | ||||||
|         let mut key = Vec::new(); |         let mut key = Vec::new(); | ||||||
| @@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         log_every_n: Option<usize>, |         log_every_n: Option<usize>, | ||||||
|         mut progress_callback: F, |         mut progress_callback: F, | ||||||
|     ) -> Result<Readers> |     ) -> Result<Readers> | ||||||
|     where F: FnMut(UpdateIndexingStep), |     where | ||||||
|  |         F: FnMut(UpdateIndexingStep), | ||||||
|     { |     { | ||||||
|         debug!("{:?}: Indexing in a Store...", thread_index); |         debug!("{:?}: Indexing in a Store...", thread_index); | ||||||
|  |  | ||||||
| @@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|             if count % num_threads == thread_index { |             if count % num_threads == thread_index { | ||||||
|                 // This is a log routine that we do every `log_every_n` documents. |                 // This is a log routine that we do every `log_every_n` documents. | ||||||
|                 if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { |                 if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { | ||||||
|                     info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); |                     info!( | ||||||
|  |                         "We have seen {} documents so far ({:.02?}).", | ||||||
|  |                         format_count(count), | ||||||
|  |                         before.elapsed() | ||||||
|  |                     ); | ||||||
|                     progress_callback(UpdateIndexingStep::IndexDocuments { |                     progress_callback(UpdateIndexingStep::IndexDocuments { | ||||||
|                         documents_seen: count, |                         documents_seen: count, | ||||||
|                         total_documents: documents_count, |                         total_documents: documents_count, | ||||||
| @@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 for (attr, content) in document.iter() { |                 for (attr, content) in document.iter() { | ||||||
|                     if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { |                     if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) | ||||||
|                         let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; |                     { | ||||||
|  |                         let value = | ||||||
|  |                             serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; | ||||||
|  |  | ||||||
|                         let (facet_numbers, facet_strings) = extract_facet_values(&value); |                         let (facet_numbers, facet_strings) = extract_facet_values(&value); | ||||||
|                         facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); |                         facet_numbers_values | ||||||
|                         facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); |                             .entry(attr) | ||||||
|  |                             .or_insert_with(Vec::new) | ||||||
|  |                             .extend(facet_numbers); | ||||||
|  |                         facet_strings_values | ||||||
|  |                             .entry(attr) | ||||||
|  |                             .or_insert_with(Vec::new) | ||||||
|  |                             .extend(facet_strings); | ||||||
|  |  | ||||||
|                         if self.searchable_fields.contains(&attr) { |                         if self.searchable_fields.contains(&attr) { | ||||||
|                             let content = match json_to_string(&value) { |                             let content = match json_to_string(&value) { | ||||||
| @@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|                             for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { |                             for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { | ||||||
|                                 last_pos = Some(pos); |                                 last_pos = Some(pos); | ||||||
|                                 let position = (attr as usize * MAX_POSITION + pos) as u32; |                                 let position = (attr as usize * MAX_POSITION + pos) as u32; | ||||||
|                                 words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); |                                 words_positions | ||||||
|  |                                     .entry(token.text().to_string()) | ||||||
|  |                                     .or_insert_with(SmallVec32::new) | ||||||
|  |                                     .push(position); | ||||||
|                             } |                             } | ||||||
|  |  | ||||||
|                             if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { |                             if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { | ||||||
|                                 let key = (attr, last_pos as u8 + 1); |                                 let key = (attr, last_pos as u8 + 1); | ||||||
|                                 self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); |                                 self.field_id_word_count_docids | ||||||
|  |                                     .entry(key) | ||||||
|  |                                     .or_insert_with(RoaringBitmap::new) | ||||||
|  |                                     .insert(document_id); | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
| @@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|             self.facet_field_string_docids, |             self.facet_field_string_docids, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut word_docids_wtr = | ||||||
|  |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         let mut builder = fst::SetBuilder::memory(); |         let mut builder = fst::SetBuilder::memory(); | ||||||
|  |  | ||||||
|         let mut iter = self.word_docids_sorter.into_iter()?; |         let mut iter = self.word_docids_sorter.into_iter()?; | ||||||
| @@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
|         let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         self.main_sorter.write_into(&mut main_wtr)?; |         self.main_sorter.write_into(&mut main_wtr)?; | ||||||
|  |  | ||||||
|         let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut words_pairs_proximities_docids_wtr = | ||||||
|         self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|  |         self.words_pairs_proximities_docids_sorter | ||||||
|  |             .write_into(&mut words_pairs_proximities_docids_wtr)?; | ||||||
|  |  | ||||||
|         let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut word_level_position_docids_wtr = | ||||||
|  |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; |         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; | ||||||
|  |  | ||||||
|         let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut field_id_word_count_docids_wtr = | ||||||
|  |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; |         self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; | ||||||
|  |  | ||||||
|         let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut facet_field_numbers_docids_wtr = | ||||||
|  |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; |         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; | ||||||
|  |  | ||||||
|         let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut facet_field_strings_docids_wtr = | ||||||
|  |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|         self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; |         self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; | ||||||
|  |  | ||||||
|         let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut field_id_docid_facet_numbers_wtr = | ||||||
|         self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|  |         self.field_id_docid_facet_numbers_sorter | ||||||
|  |             .write_into(&mut field_id_docid_facet_numbers_wtr)?; | ||||||
|  |  | ||||||
|         let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; |         let mut field_id_docid_facet_strings_wtr = | ||||||
|         self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; |             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||||
|  |         self.field_id_docid_facet_strings_sorter | ||||||
|  |             .write_into(&mut field_id_docid_facet_strings_wtr)?; | ||||||
|  |  | ||||||
|         let main = writer_into_reader(main_wtr, shrink_size)?; |         let main = writer_into_reader(main_wtr, shrink_size)?; | ||||||
|         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; |         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; | ||||||
|         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; |         let words_pairs_proximities_docids = | ||||||
|         let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; |             writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; | ||||||
|         let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; |         let word_level_position_docids = | ||||||
|         let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; |             writer_into_reader(word_level_position_docids_wtr, shrink_size)?; | ||||||
|         let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; |         let field_id_word_count_docids = | ||||||
|         let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; |             writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; | ||||||
|         let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; |         let facet_field_numbers_docids = | ||||||
|         let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; |             writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; | ||||||
|  |         let facet_field_strings_docids = | ||||||
|  |             writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; | ||||||
|  |         let field_id_docid_facet_numbers = | ||||||
|  |             writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; | ||||||
|  |         let field_id_docid_facet_strings = | ||||||
|  |             writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; | ||||||
|  |         let docid_word_positions = | ||||||
|  |             writer_into_reader(self.docid_word_positions_writer, shrink_size)?; | ||||||
|         let documents = writer_into_reader(self.documents_writer, shrink_size)?; |         let documents = writer_into_reader(self.documents_writer, shrink_size)?; | ||||||
|  |  | ||||||
|         Ok(Readers { |         Ok(Readers { | ||||||
| @@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | |||||||
| /// close to each other. | /// close to each other. | ||||||
| fn compute_words_pair_proximities( | fn compute_words_pair_proximities( | ||||||
|     word_positions: &HashMap<String, SmallVec32<Position>>, |     word_positions: &HashMap<String, SmallVec32<Position>>, | ||||||
| ) -> HashMap<(&str, &str), u8> | ) -> HashMap<(&str, &str), u8> { | ||||||
| { |  | ||||||
|     use itertools::Itertools; |     use itertools::Itertools; | ||||||
|  |  | ||||||
|     let mut words_pair_proximities = HashMap::new(); |     let mut words_pair_proximities = HashMap::new(); | ||||||
| @@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool { | |||||||
| /// take an iterator on tokens and compute their relative position depending on separator kinds | /// take an iterator on tokens and compute their relative position depending on separator kinds | ||||||
| /// if it's a `Hard` separator we add an additional relative proximity of 8 between words, | /// if it's a `Hard` separator we add an additional relative proximity of 8 between words, | ||||||
| /// else we keep the standard proximity of 1 between words. | /// else we keep the standard proximity of 1 between words. | ||||||
| fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> { | fn process_tokens<'a>( | ||||||
|  |     tokens: impl Iterator<Item = Token<'a>>, | ||||||
|  | ) -> impl Iterator<Item = (usize, Token<'a>)> { | ||||||
|     tokens |     tokens | ||||||
|         .skip_while(|token| token.is_separator().is_some()) |         .skip_while(|token| token.is_separator().is_some()) | ||||||
|         .scan((0, None), |(offset, prev_kind), token| { |         .scan((0, None), |(offset, prev_kind), token| { | ||||||
|                 match token.kind { |             match token.kind { | ||||||
|                     TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { |                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { | ||||||
|                         *offset += match *prev_kind { |                     *offset += match *prev_kind { | ||||||
|                             Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, |                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, | ||||||
|                             Some(_) => 1, |                         Some(_) => 1, | ||||||
|                             None => 0, |                         None => 0, | ||||||
|                         }; |                     }; | ||||||
|                         *prev_kind = Some(token.kind) |                     *prev_kind = Some(token.kind) | ||||||
|                     } |  | ||||||
|                     TokenKind::Separator(SeparatorKind::Hard) => { |  | ||||||
|                         *prev_kind = Some(token.kind); |  | ||||||
|                     } |  | ||||||
|                     TokenKind::Separator(SeparatorKind::Soft) |  | ||||||
|                         if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { |  | ||||||
|                         *prev_kind = Some(token.kind); |  | ||||||
|                     } |  | ||||||
|                     _ => (), |  | ||||||
|                 } |                 } | ||||||
|  |                 TokenKind::Separator(SeparatorKind::Hard) => { | ||||||
|  |                     *prev_kind = Some(token.kind); | ||||||
|  |                 } | ||||||
|  |                 TokenKind::Separator(SeparatorKind::Soft) | ||||||
|  |                     if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => | ||||||
|  |                 { | ||||||
|  |                     *prev_kind = Some(token.kind); | ||||||
|  |                 } | ||||||
|  |                 _ => (), | ||||||
|  |             } | ||||||
|             Some((*offset, token)) |             Some((*offset, token)) | ||||||
|         }) |         }) | ||||||
|     .filter(|(_, t)| t.is_word()) |         .filter(|(_, t)| t.is_word()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { | fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { | ||||||
| @@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { | |||||||
|         match value { |         match value { | ||||||
|             Value::Null => (), |             Value::Null => (), | ||||||
|             Value::Bool(b) => output_strings.push(b.to_string()), |             Value::Bool(b) => output_strings.push(b.to_string()), | ||||||
|             Value::Number(number) => if let Some(float) = number.as_f64() { |             Value::Number(number) => { | ||||||
|                 output_numbers.push(float); |                 if let Some(float) = number.as_f64() { | ||||||
|             }, |                     output_numbers.push(float); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|             Value::String(string) => { |             Value::String(string) => { | ||||||
|                 let string = string.trim().to_lowercase(); |                 let string = string.trim().to_lowercase(); | ||||||
|                 output_strings.push(string); |                 output_strings.push(string); | ||||||
|             }, |             } | ||||||
|             Value::Array(values) => if can_recurse { |             Value::Array(values) => { | ||||||
|                 for value in values { |                 if can_recurse { | ||||||
|                     inner_extract_facet_values(value, false, output_numbers, output_strings); |                     for value in values { | ||||||
|  |                         inner_extract_facet_values(value, false, output_numbers, output_strings); | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             Value::Object(_) => (), |             Value::Object(_) => (), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -10,14 +10,15 @@ use log::info; | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::{Map, Value}; | use serde_json::{Map, Value}; | ||||||
|  |  | ||||||
| use crate::error::{Error, UserError, InternalError}; |  | ||||||
| use crate::index::db_name; |  | ||||||
| use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; |  | ||||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; |  | ||||||
| use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; |  | ||||||
| use crate::{Index, Result}; |  | ||||||
| use super::merge_function::merge_two_obkvs; | use super::merge_function::merge_two_obkvs; | ||||||
| use super::{create_writer, create_sorter, IndexDocumentsMethod}; | use super::{create_sorter, create_writer, IndexDocumentsMethod}; | ||||||
|  | use crate::error::{Error, InternalError, UserError}; | ||||||
|  | use crate::index::db_name; | ||||||
|  | use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; | ||||||
|  | use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||||
|  | use crate::{ | ||||||
|  |     ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, | ||||||
|  | }; | ||||||
|  |  | ||||||
| const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | ||||||
|  |  | ||||||
| @@ -64,7 +65,11 @@ impl Transform<'_, '_> { | |||||||
|         self.output_from_generic_json(reader, false, progress_callback) |         self.output_from_generic_json(reader, false, progress_callback) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput> |     pub fn output_from_json_stream<R, F>( | ||||||
|  |         self, | ||||||
|  |         reader: R, | ||||||
|  |         progress_callback: F, | ||||||
|  |     ) -> Result<TransformOutput> | ||||||
|     where |     where | ||||||
|         R: Read, |         R: Read, | ||||||
|         F: Fn(UpdateIndexingStep) + Sync, |         F: Fn(UpdateIndexingStep) + Sync, | ||||||
| @@ -86,14 +91,16 @@ impl Transform<'_, '_> { | |||||||
|         let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); |         let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); | ||||||
|  |  | ||||||
|         // Deserialize the whole batch of documents in memory. |         // Deserialize the whole batch of documents in memory. | ||||||
|         let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream { |         let mut documents: Peekable< | ||||||
|  |             Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>, | ||||||
|  |         > = if is_stream { | ||||||
|             let iter = serde_json::Deserializer::from_reader(reader).into_iter(); |             let iter = serde_json::Deserializer::from_reader(reader).into_iter(); | ||||||
|             let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>; |             let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>; | ||||||
|             iter.peekable() |             iter.peekable() | ||||||
|         } else { |         } else { | ||||||
|             let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; |             let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; | ||||||
|             let iter = vec.into_iter().map(Ok); |             let iter = vec.into_iter().map(Ok); | ||||||
|             let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>; |             let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>; | ||||||
|             iter.peekable() |             iter.peekable() | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
| @@ -104,15 +111,16 @@ impl Transform<'_, '_> { | |||||||
|             Err(_) => { |             Err(_) => { | ||||||
|                 let error = documents.next().unwrap().unwrap_err(); |                 let error = documents.next().unwrap().unwrap_err(); | ||||||
|                 return Err(UserError::SerdeJson(error).into()); |                 return Err(UserError::SerdeJson(error).into()); | ||||||
|             }, |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); |         let alternative_name = | ||||||
|  |             first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); | ||||||
|         let (primary_key_id, primary_key) = compute_primary_key_pair( |         let (primary_key_id, primary_key) = compute_primary_key_pair( | ||||||
|             self.index.primary_key(self.rtxn)?, |             self.index.primary_key(self.rtxn)?, | ||||||
|             &mut fields_ids_map, |             &mut fields_ids_map, | ||||||
|             alternative_name, |             alternative_name, | ||||||
|             self.autogenerate_docids |             self.autogenerate_docids, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         if documents.peek().is_none() { |         if documents.peek().is_none() { | ||||||
| @@ -173,9 +181,11 @@ impl Transform<'_, '_> { | |||||||
|                 Some(value) => match value { |                 Some(value) => match value { | ||||||
|                     Value::String(string) => Cow::Borrowed(string.as_str()), |                     Value::String(string) => Cow::Borrowed(string.as_str()), | ||||||
|                     Value::Number(number) => Cow::Owned(number.to_string()), |                     Value::Number(number) => Cow::Owned(number.to_string()), | ||||||
|                     content => return Err(UserError::InvalidDocumentId { |                     content => { | ||||||
|                         document_id: content.clone(), |                         return Err( | ||||||
|                     }.into()), |                             UserError::InvalidDocumentId { document_id: content.clone() }.into() | ||||||
|  |                         ) | ||||||
|  |                     } | ||||||
|                 }, |                 }, | ||||||
|                 None => { |                 None => { | ||||||
|                     if !self.autogenerate_docids { |                     if !self.autogenerate_docids { | ||||||
| @@ -183,7 +193,7 @@ impl Transform<'_, '_> { | |||||||
|                     } |                     } | ||||||
|                     let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); |                     let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); | ||||||
|                     Cow::Borrowed(uuid) |                     Cow::Borrowed(uuid) | ||||||
|                 }, |                 } | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // We iterate in the fields ids ordered. |             // We iterate in the fields ids ordered. | ||||||
| @@ -194,7 +204,8 @@ impl Transform<'_, '_> { | |||||||
|                 // and this should be the document id we return the one we generated. |                 // and this should be the document id we return the one we generated. | ||||||
|                 if let Some(value) = document.get(name) { |                 if let Some(value) = document.get(name) { | ||||||
|                     // We serialize the attribute values. |                     // We serialize the attribute values. | ||||||
|                     serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; |                     serde_json::to_writer(&mut json_buffer, value) | ||||||
|  |                         .map_err(InternalError::SerdeJson)?; | ||||||
|                     writer.insert(field_id, &json_buffer)?; |                     writer.insert(field_id, &json_buffer)?; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
| @@ -202,7 +213,8 @@ impl Transform<'_, '_> { | |||||||
|                 if field_id == primary_key_id && validate_document_id(&external_id).is_none() { |                 if field_id == primary_key_id && validate_document_id(&external_id).is_none() { | ||||||
|                     return Err(UserError::InvalidDocumentId { |                     return Err(UserError::InvalidDocumentId { | ||||||
|                         document_id: Value::from(external_id), |                         document_id: Value::from(external_id), | ||||||
|                     }.into()); |                     } | ||||||
|  |                     .into()); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
| @@ -248,9 +260,9 @@ impl Transform<'_, '_> { | |||||||
|         // Extract the position of the primary key in the current headers, None if not found. |         // Extract the position of the primary key in the current headers, None if not found. | ||||||
|         let primary_key_pos = match self.index.primary_key(self.rtxn)? { |         let primary_key_pos = match self.index.primary_key(self.rtxn)? { | ||||||
|             Some(primary_key) => { |             Some(primary_key) => { | ||||||
|                // The primary key is known so we must find the position in the CSV headers. |                 // The primary key is known so we must find the position in the CSV headers. | ||||||
|                headers.iter().position(|h| h == primary_key) |                 headers.iter().position(|h| h == primary_key) | ||||||
|             }, |             } | ||||||
|             None => headers.iter().position(is_primary_key), |             None => headers.iter().position(is_primary_key), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
| @@ -261,7 +273,7 @@ impl Transform<'_, '_> { | |||||||
|             self.index.primary_key(self.rtxn)?, |             self.index.primary_key(self.rtxn)?, | ||||||
|             &mut fields_ids_map, |             &mut fields_ids_map, | ||||||
|             alternative_name, |             alternative_name, | ||||||
|             self.autogenerate_docids |             self.autogenerate_docids, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         // The primary key field is not present in the header, so we need to create it. |         // The primary key field is not present in the header, so we need to create it. | ||||||
| @@ -308,27 +320,30 @@ impl Transform<'_, '_> { | |||||||
|                     // We validate the document id [a-zA-Z0-9\-_]. |                     // We validate the document id [a-zA-Z0-9\-_]. | ||||||
|                     match validate_document_id(&external_id) { |                     match validate_document_id(&external_id) { | ||||||
|                         Some(valid) => valid, |                         Some(valid) => valid, | ||||||
|                         None => return Err(UserError::InvalidDocumentId { |                         None => { | ||||||
|                             document_id: Value::from(external_id), |                             return Err(UserError::InvalidDocumentId { | ||||||
|                         }.into()), |                                 document_id: Value::from(external_id), | ||||||
|  |                             } | ||||||
|  |                             .into()) | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 }, |                 } | ||||||
|                 None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), |                 None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // When the primary_key_field_id is found in the fields ids list |             // When the primary_key_field_id is found in the fields ids list | ||||||
|             // we return the generated document id instead of the record field. |             // we return the generated document id instead of the record field. | ||||||
|             let iter = fields_ids.iter() |             let iter = fields_ids.iter().map(|(fi, i)| { | ||||||
|                 .map(|(fi, i)| { |                 let field = if *fi == primary_key_id { external_id } else { &record[*i] }; | ||||||
|                     let field = if *fi == primary_key_id { external_id } else { &record[*i] }; |                 (fi, field) | ||||||
|                     (fi, field) |             }); | ||||||
|                 }); |  | ||||||
|  |  | ||||||
|             // We retrieve the field id based on the fields ids map fields ids order. |             // We retrieve the field id based on the fields ids map fields ids order. | ||||||
|             for (field_id, field) in iter { |             for (field_id, field) in iter { | ||||||
|                 // We serialize the attribute values as JSON strings. |                 // We serialize the attribute values as JSON strings. | ||||||
|                 json_buffer.clear(); |                 json_buffer.clear(); | ||||||
|                 serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; |                 serde_json::to_writer(&mut json_buffer, &field) | ||||||
|  |                     .map_err(InternalError::SerdeJson)?; | ||||||
|                 writer.insert(*field_id, &json_buffer)?; |                 writer.insert(*field_id, &json_buffer)?; | ||||||
|             } |             } | ||||||
|  |  | ||||||
| @@ -410,26 +425,27 @@ impl Transform<'_, '_> { | |||||||
|                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), |                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), | ||||||
|                         IndexDocumentsMethod::UpdateDocuments => { |                         IndexDocumentsMethod::UpdateDocuments => { | ||||||
|                             let key = BEU32::new(docid); |                             let key = BEU32::new(docid); | ||||||
|                             let base_obkv = self.index.documents.get(&self.rtxn, &key)? |                             let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( | ||||||
|                                 .ok_or(InternalError::DatabaseMissingEntry { |                                 InternalError::DatabaseMissingEntry { | ||||||
|                                     db_name: db_name::DOCUMENTS, |                                     db_name: db_name::DOCUMENTS, | ||||||
|                                     key: None, |                                     key: None, | ||||||
|                                 })?; |                                 }, | ||||||
|  |                             )?; | ||||||
|                             let update_obkv = obkv::KvReader::new(update_obkv); |                             let update_obkv = obkv::KvReader::new(update_obkv); | ||||||
|                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); |                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); | ||||||
|                             (docid, obkv_buffer.as_slice()) |                             (docid, obkv_buffer.as_slice()) | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 }, |                 } | ||||||
|                 None => { |                 None => { | ||||||
|                     // If this user id is new we add it to the external documents ids map |                     // If this user id is new we add it to the external documents ids map | ||||||
|                     // for new ids and into the list of new documents. |                     // for new ids and into the list of new documents. | ||||||
|                     let new_docid = available_documents_ids.next() |                     let new_docid = | ||||||
|                         .ok_or(UserError::DocumentLimitReached)?; |                         available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; | ||||||
|                     new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; |                     new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; | ||||||
|                     new_documents_ids.insert(new_docid); |                     new_documents_ids.insert(new_docid); | ||||||
|                     (new_docid, update_obkv) |                     (new_docid, update_obkv) | ||||||
|                 }, |                 } | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // We insert the document under the documents ids map into the final file. |             // We insert the document under the documents ids map into the final file. | ||||||
| @@ -450,7 +466,8 @@ impl Transform<'_, '_> { | |||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|         let file = tempfile::tempfile()?; |         let file = tempfile::tempfile()?; | ||||||
|         let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; |         let mut writer = | ||||||
|  |             create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; | ||||||
|  |  | ||||||
|         // Once we have written all the documents into the final sorter, we write the documents |         // Once we have written all the documents into the final sorter, we write the documents | ||||||
|         // into this writer, extract the file and reset the seek to be able to read it again. |         // into this writer, extract the file and reset the seek to be able to read it again. | ||||||
| @@ -485,8 +502,7 @@ impl Transform<'_, '_> { | |||||||
|         primary_key: String, |         primary_key: String, | ||||||
|         old_fields_ids_map: FieldsIdsMap, |         old_fields_ids_map: FieldsIdsMap, | ||||||
|         new_fields_ids_map: FieldsIdsMap, |         new_fields_ids_map: FieldsIdsMap, | ||||||
|     ) -> Result<TransformOutput> |     ) -> Result<TransformOutput> { | ||||||
|     { |  | ||||||
|         let fields_distribution = self.index.fields_distribution(self.rtxn)?; |         let fields_distribution = self.index.fields_distribution(self.rtxn)?; | ||||||
|         let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; |         let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; | ||||||
|         let documents_ids = self.index.documents_ids(self.rtxn)?; |         let documents_ids = self.index.documents_ids(self.rtxn)?; | ||||||
| @@ -494,7 +510,8 @@ impl Transform<'_, '_> { | |||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|         let file = tempfile::tempfile()?; |         let file = tempfile::tempfile()?; | ||||||
|         let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; |         let mut writer = | ||||||
|  |             create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; | ||||||
|  |  | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut obkv_buffer = Vec::new(); | ||||||
|         for result in self.index.documents.iter(self.rtxn)? { |         for result in self.index.documents.iter(self.rtxn)? { | ||||||
| @@ -561,20 +578,19 @@ fn compute_primary_key_pair( | |||||||
|                         return Err(UserError::MissingPrimaryKey.into()); |                         return Err(UserError::MissingPrimaryKey.into()); | ||||||
|                     } |                     } | ||||||
|                     DEFAULT_PRIMARY_KEY_NAME.to_string() |                     DEFAULT_PRIMARY_KEY_NAME.to_string() | ||||||
|                 }, |                 } | ||||||
|             }; |             }; | ||||||
|             let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; |             let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; | ||||||
|             Ok((id, name)) |             Ok((id, name)) | ||||||
|         }, |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn validate_document_id(document_id: &str) -> Option<&str> { | fn validate_document_id(document_id: &str) -> Option<&str> { | ||||||
|     let document_id = document_id.trim(); |     let document_id = document_id.trim(); | ||||||
|     Some(document_id).filter(|id| { |     Some(document_id).filter(|id| { | ||||||
|         !id.is_empty() && id.chars().all(|c| { |         !id.is_empty() | ||||||
|             matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_') |             && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) | ||||||
|         }) |  | ||||||
|     }) |     }) | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -583,8 +599,7 @@ mod test { | |||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     mod compute_primary_key { |     mod compute_primary_key { | ||||||
|         use super::compute_primary_key_pair; |         use super::{compute_primary_key_pair, FieldsIdsMap}; | ||||||
|         use super::FieldsIdsMap; |  | ||||||
|  |  | ||||||
|         #[test] |         #[test] | ||||||
|         fn should_return_primary_key_if_is_some() { |         fn should_return_primary_key_if_is_some() { | ||||||
| @@ -594,7 +609,8 @@ mod test { | |||||||
|                 Some("toto"), |                 Some("toto"), | ||||||
|                 &mut fields_map, |                 &mut fields_map, | ||||||
|                 Some("tata".to_string()), |                 Some("tata".to_string()), | ||||||
|                 false); |                 false, | ||||||
|  |             ); | ||||||
|             assert_eq!(result.unwrap(), (0u8, "toto".to_string())); |             assert_eq!(result.unwrap(), (0u8, "toto".to_string())); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
| @@ -602,11 +618,8 @@ mod test { | |||||||
|         #[test] |         #[test] | ||||||
|         fn should_return_alternative_if_primary_is_none() { |         fn should_return_alternative_if_primary_is_none() { | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|             let result = compute_primary_key_pair( |             let result = | ||||||
|                 None, |                 compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); | ||||||
|                 &mut fields_map, |  | ||||||
|                 Some("tata".to_string()), |  | ||||||
|                 false); |  | ||||||
|             assert_eq!(result.unwrap(), (0u8, "tata".to_string())); |             assert_eq!(result.unwrap(), (0u8, "tata".to_string())); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
| @@ -614,23 +627,15 @@ mod test { | |||||||
|         #[test] |         #[test] | ||||||
|         fn should_return_default_if_both_are_none() { |         fn should_return_default_if_both_are_none() { | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|             let result = compute_primary_key_pair( |             let result = compute_primary_key_pair(None, &mut fields_map, None, true); | ||||||
|                 None, |  | ||||||
|                 &mut fields_map, |  | ||||||
|                 None, |  | ||||||
|                 true); |  | ||||||
|             assert_eq!(result.unwrap(), (0u8, "id".to_string())); |             assert_eq!(result.unwrap(), (0u8, "id".to_string())); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         #[test] |         #[test] | ||||||
|         fn should_return_err_if_both_are_none_and_recompute_is_false(){ |         fn should_return_err_if_both_are_none_and_recompute_is_false() { | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|             let result = compute_primary_key_pair( |             let result = compute_primary_key_pair(None, &mut fields_map, None, false); | ||||||
|                 None, |  | ||||||
|                 &mut fields_map, |  | ||||||
|                 None, |  | ||||||
|                 false); |  | ||||||
|             assert!(result.is_err()); |             assert!(result.is_err()); | ||||||
|             assert_eq!(fields_map.len(), 0); |             assert_eq!(fields_map.len(), 0); | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds; | |||||||
| pub use self::clear_documents::ClearDocuments; | pub use self::clear_documents::ClearDocuments; | ||||||
| pub use self::delete_documents::DeleteDocuments; | pub use self::delete_documents::DeleteDocuments; | ||||||
| pub use self::facets::Facets; | pub use self::facets::Facets; | ||||||
| pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat}; | pub use self::index_documents::{ | ||||||
|  |     DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat, | ||||||
|  | }; | ||||||
| pub use self::settings::{Setting, Settings}; | pub use self::settings::{Setting, Settings}; | ||||||
| pub use self::update_builder::UpdateBuilder; | pub use self::update_builder::UpdateBuilder; | ||||||
| pub use self::update_step::UpdateIndexingStep; | pub use self::update_step::UpdateIndexingStep; | ||||||
|   | |||||||
| @@ -34,17 +34,24 @@ impl<T> Setting<T> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<T: Serialize> Serialize for Setting<T> { | impl<T: Serialize> Serialize for Setting<T> { | ||||||
|     fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> where S: Serializer { |     fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> | ||||||
|  |     where | ||||||
|  |         S: Serializer, | ||||||
|  |     { | ||||||
|         match self { |         match self { | ||||||
|             Self::Set(value) => Some(value), |             Self::Set(value) => Some(value), | ||||||
|             // Usually not_set isn't serialized by setting skip_serializing_if field attribute |             // Usually not_set isn't serialized by setting skip_serializing_if field attribute | ||||||
|             Self::NotSet | Self::Reset => None, |             Self::NotSet | Self::Reset => None, | ||||||
|         }.serialize(serializer) |         } | ||||||
|  |         .serialize(serializer) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> { | impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> { | ||||||
|     fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> where D: Deserializer<'de> { |     fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> | ||||||
|  |     where | ||||||
|  |         D: Deserializer<'de>, | ||||||
|  |     { | ||||||
|         Deserialize::deserialize(deserializer).map(|x| match x { |         Deserialize::deserialize(deserializer).map(|x| match x { | ||||||
|             Some(x) => Self::Set(x), |             Some(x) => Self::Set(x), | ||||||
|             None => Self::Reset, // Reset is forced by sending null value |             None => Self::Reset, // Reset is forced by sending null value | ||||||
| @@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) { |     pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) { | ||||||
|         self.stop_words = if stop_words.is_empty() { |         self.stop_words = | ||||||
|             Setting::Reset |             if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) } | ||||||
|         } else { |  | ||||||
|             Setting::Set(stop_words) |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn reset_distinct_field(&mut self) { |     pub fn reset_distinct_field(&mut self) { | ||||||
| @@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) { |     pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) { | ||||||
|         self.synonyms = if synonyms.is_empty() { |         self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) } | ||||||
|             Setting::Reset |  | ||||||
|         } else { |  | ||||||
|             Setting::Set(synonyms) |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn reset_primary_key(&mut self) { |     pub fn reset_primary_key(&mut self) { | ||||||
| @@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|  |  | ||||||
|     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> |     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> | ||||||
|     where |     where | ||||||
|         F: Fn(UpdateIndexingStep, u64) + Sync |         F: Fn(UpdateIndexingStep, u64) + Sync, | ||||||
|     { |     { | ||||||
|         let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; |         let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; | ||||||
|         let update_id = self.update_id; |         let update_id = self.update_id; | ||||||
| @@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // There already has been a document addition, the primary key should be set by now. |         // There already has been a document addition, the primary key should be set by now. | ||||||
|         let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; |         let primary_key = | ||||||
|  |             self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; | ||||||
|  |  | ||||||
|         // We remap the documents fields based on the new `FieldsIdsMap`. |         // We remap the documents fields based on the new `FieldsIdsMap`. | ||||||
|         let output = transform.remap_index_documents( |         let output = transform.remap_index_documents( | ||||||
| @@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|             Setting::Set(ref fields) => { |             Setting::Set(ref fields) => { | ||||||
|                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; |                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; | ||||||
|                 // fields are deduplicated, only the first occurrence is taken into account |                 // fields are deduplicated, only the first occurrence is taken into account | ||||||
|                 let names: Vec<_> = fields |                 let names: Vec<_> = fields.iter().unique().map(String::as_str).collect(); | ||||||
|                     .iter() |  | ||||||
|                     .unique() |  | ||||||
|                     .map(String::as_str) |  | ||||||
|                     .collect(); |  | ||||||
|  |  | ||||||
|                 for name in names.iter() { |                 for name in names.iter() { | ||||||
|                     fields_ids_map |                     fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; | ||||||
|                         .insert(name) |  | ||||||
|                         .ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|                 } |                 } | ||||||
|                 self.index.put_displayed_fields(self.wtxn, &names)?; |                 self.index.put_displayed_fields(self.wtxn, &names)?; | ||||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; |                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||||
|             } |             } | ||||||
|             Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; } |             Setting::Reset => { | ||||||
|  |                 self.index.delete_displayed_fields(self.wtxn)?; | ||||||
|  |             } | ||||||
|             Setting::NotSet => return Ok(false), |             Setting::NotSet => return Ok(false), | ||||||
|         } |         } | ||||||
|         Ok(true) |         Ok(true) | ||||||
| @@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|         match self.distinct_field { |         match self.distinct_field { | ||||||
|             Setting::Set(ref attr) => { |             Setting::Set(ref attr) => { | ||||||
|                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; |                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; | ||||||
|                 fields_ids_map |                 fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?; | ||||||
|                     .insert(attr) |  | ||||||
|                     .ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|  |  | ||||||
|                 self.index.put_distinct_field(self.wtxn, &attr)?; |                 self.index.put_distinct_field(self.wtxn, &attr)?; | ||||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; |                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||||
|             } |             } | ||||||
|             Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; }, |             Setting::Reset => { | ||||||
|  |                 self.index.delete_distinct_field(self.wtxn)?; | ||||||
|  |             } | ||||||
|             Setting::NotSet => return Ok(false), |             Setting::NotSet => return Ok(false), | ||||||
|         } |         } | ||||||
|         Ok(true) |         Ok(true) | ||||||
| @@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|  |  | ||||||
|                 let mut new_fields_ids_map = FieldsIdsMap::new(); |                 let mut new_fields_ids_map = FieldsIdsMap::new(); | ||||||
|                 // fields are deduplicated, only the first occurrence is taken into account |                 // fields are deduplicated, only the first occurrence is taken into account | ||||||
|                 let names = fields |                 let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>(); | ||||||
|                     .iter() |  | ||||||
|                     .unique() |  | ||||||
|                     .map(String::as_str) |  | ||||||
|                     .collect::<Vec<_>>(); |  | ||||||
|  |  | ||||||
|                 // Add all the searchable attributes to the field map, and then add the |                 // Add all the searchable attributes to the field map, and then add the | ||||||
|                 // remaining fields from the old field map to the new one |                 // remaining fields from the old field map to the new one | ||||||
|                 for name in names.iter() { |                 for name in names.iter() { | ||||||
|                     new_fields_ids_map |                     new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; | ||||||
|                         .insert(&name) |  | ||||||
|                         .ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 for (_, name) in old_fields_ids_map.iter() { |                 for (_, name) in old_fields_ids_map.iter() { | ||||||
|                     new_fields_ids_map |                     new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; | ||||||
|                         .insert(&name) |  | ||||||
|                         .ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 self.index.put_searchable_fields(self.wtxn, &names)?; |                 self.index.put_searchable_fields(self.wtxn, &names)?; | ||||||
|                 self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; |                 self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; | ||||||
|             } |             } | ||||||
|             Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; } |             Setting::Reset => { | ||||||
|  |                 self.index.delete_searchable_fields(self.wtxn)?; | ||||||
|  |             } | ||||||
|             Setting::NotSet => return Ok(false), |             Setting::NotSet => return Ok(false), | ||||||
|         } |         } | ||||||
|         Ok(true) |         Ok(true) | ||||||
| @@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 let fst = fst::Set::from_iter(stop_words)?; |                 let fst = fst::Set::from_iter(stop_words)?; | ||||||
|  |  | ||||||
|                 // Does the new FST differ from the previous one? |                 // Does the new FST differ from the previous one? | ||||||
|                 if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { |                 if current | ||||||
|  |                     .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) | ||||||
|  |                 { | ||||||
|                     // we want to re-create our FST. |                     // we want to re-create our FST. | ||||||
|                     self.index.put_stop_words(self.wtxn, &fst)?; |                     self.index.put_stop_words(self.wtxn, &fst)?; | ||||||
|                     Ok(true) |                     Ok(true) | ||||||
| @@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                     analyzer |                     analyzer | ||||||
|                         .analyze(text) |                         .analyze(text) | ||||||
|                         .tokens() |                         .tokens() | ||||||
|                         .filter_map(|token| |                         .filter_map(|token| { | ||||||
|                             if token.is_word() { Some(token.text().to_string()) } else { None } |                             if token.is_word() { | ||||||
|                         ) |                                 Some(token.text().to_string()) | ||||||
|  |                             } else { | ||||||
|  |                                 None | ||||||
|  |                             } | ||||||
|  |                         }) | ||||||
|                         .collect::<Vec<_>>() |                         .collect::<Vec<_>>() | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
| @@ -360,25 +357,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 for (word, synonyms) in synonyms { |                 for (word, synonyms) in synonyms { | ||||||
|                     // Normalize both the word and associated synonyms. |                     // Normalize both the word and associated synonyms. | ||||||
|                     let normalized_word = normalize(&analyzer, word); |                     let normalized_word = normalize(&analyzer, word); | ||||||
|                     let normalized_synonyms = synonyms |                     let normalized_synonyms = | ||||||
|                         .iter() |                         synonyms.iter().map(|synonym| normalize(&analyzer, synonym)); | ||||||
|                         .map(|synonym| normalize(&analyzer, synonym)); |  | ||||||
|  |  | ||||||
|                     // Store the normalized synonyms under the normalized word, |                     // Store the normalized synonyms under the normalized word, | ||||||
|                     // merging the possible duplicate words. |                     // merging the possible duplicate words. | ||||||
|                     let entry = new_synonyms |                     let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new); | ||||||
|                         .entry(normalized_word) |  | ||||||
|                         .or_insert_with(Vec::new); |  | ||||||
|                     entry.extend(normalized_synonyms); |                     entry.extend(normalized_synonyms); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 // Make sure that we don't have duplicate synonyms. |                 // Make sure that we don't have duplicate synonyms. | ||||||
|                 new_synonyms |                 new_synonyms.iter_mut().for_each(|(_, synonyms)| { | ||||||
|                     .iter_mut() |                     synonyms.sort_unstable(); | ||||||
|                     .for_each(|(_, synonyms)| { |                     synonyms.dedup(); | ||||||
|                         synonyms.sort_unstable(); |                 }); | ||||||
|                         synonyms.dedup(); |  | ||||||
|                     }); |  | ||||||
|  |  | ||||||
|                 let old_synonyms = self.index.synonyms(self.wtxn)?; |                 let old_synonyms = self.index.synonyms(self.wtxn)?; | ||||||
|  |  | ||||||
| @@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 self.index.put_filterable_fields(self.wtxn, &new_facets)?; |                 self.index.put_filterable_fields(self.wtxn, &new_facets)?; | ||||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; |                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||||
|             } |             } | ||||||
|             Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } |             Setting::Reset => { | ||||||
|  |                 self.index.delete_filterable_fields(self.wtxn)?; | ||||||
|  |             } | ||||||
|             Setting::NotSet => (), |             Setting::NotSet => (), | ||||||
|         } |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 self.index.put_criteria(self.wtxn, &new_criteria)?; |                 self.index.put_criteria(self.wtxn, &new_criteria)?; | ||||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; |                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||||
|             } |             } | ||||||
|             Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } |             Setting::Reset => { | ||||||
|  |                 self.index.delete_criteria(self.wtxn)?; | ||||||
|  |             } | ||||||
|             Setting::NotSet => (), |             Setting::NotSet => (), | ||||||
|         } |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 } else { |                 } else { | ||||||
|                     Err(UserError::PrimaryKeyCannotBeChanged.into()) |                     Err(UserError::PrimaryKeyCannotBeChanged.into()) | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             Setting::Reset => { |             Setting::Reset => { | ||||||
|                 if self.index.number_of_documents(&self.wtxn)? == 0 { |                 if self.index.number_of_documents(&self.wtxn)? == 0 { | ||||||
|                     self.index.delete_primary_key(self.wtxn)?; |                     self.index.delete_primary_key(self.wtxn)?; | ||||||
| @@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                 } else { |                 } else { | ||||||
|                     Err(UserError::PrimaryKeyCannotBeReset.into()) |                     Err(UserError::PrimaryKeyCannotBeReset.into()) | ||||||
|                 } |                 } | ||||||
|             }, |             } | ||||||
|             Setting::NotSet => Ok(()), |             Setting::NotSet => Ok(()), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn execute<F>(mut self, progress_callback: F) -> Result<()> |     pub fn execute<F>(mut self, progress_callback: F) -> Result<()> | ||||||
|         where |     where | ||||||
|             F: Fn(UpdateIndexingStep, u64) + Sync |         F: Fn(UpdateIndexingStep, u64) + Sync, | ||||||
|     { |     { | ||||||
|         self.index.set_updated_at(self.wtxn, &Utc::now())?; |         self.index.set_updated_at(self.wtxn, &Utc::now())?; | ||||||
|  |  | ||||||
| @@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use heed::EnvOpenOptions; |  | ||||||
|     use heed::types::ByteSlice; |  | ||||||
|     use maplit::{btreeset, hashmap, hashset}; |  | ||||||
|     use big_s::S; |     use big_s::S; | ||||||
|  |     use heed::types::ByteSlice; | ||||||
|  |     use heed::EnvOpenOptions; | ||||||
|  |     use maplit::{btreeset, hashmap, hashset}; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|     use crate::error::Error; |     use crate::error::Error; | ||||||
|     use crate::update::{IndexDocuments, UpdateFormat}; |     use crate::update::{IndexDocuments, UpdateFormat}; | ||||||
|     use crate::{Criterion, FilterCondition, SearchResult}; |     use crate::{Criterion, FilterCondition, SearchResult}; | ||||||
|  |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn set_and_reset_searchable_fields() { |     fn set_and_reset_searchable_fields() { | ||||||
|         let path = tempfile::tempdir().unwrap(); |         let path = tempfile::tempdir().unwrap(); | ||||||
| @@ -674,7 +669,7 @@ mod tests { | |||||||
|         // Set the filterable fields to be the age. |         // Set the filterable fields to be the age. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_filterable_fields(hashset!{ S("age") }); |         builder.set_filterable_fields(hashset! { S("age") }); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|  |  | ||||||
|         // Then index some documents. |         // Then index some documents. | ||||||
| @@ -692,12 +687,15 @@ mod tests { | |||||||
|         // Check that the displayed fields are correctly set. |         // Check that the displayed fields are correctly set. | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         let fields_ids = index.filterable_fields(&rtxn).unwrap(); |         let fields_ids = index.filterable_fields(&rtxn).unwrap(); | ||||||
|         assert_eq!(fields_ids, hashset!{ S("age") }); |         assert_eq!(fields_ids, hashset! { S("age") }); | ||||||
|         // Only count the field_id 0 and level 0 facet values. |         // Only count the field_id 0 and level 0 facet values. | ||||||
|         // TODO we must support typed CSVs for numbers to be understood. |         // TODO we must support typed CSVs for numbers to be understood. | ||||||
|         let count = index.facet_id_f64_docids |         let count = index | ||||||
|  |             .facet_id_f64_docids | ||||||
|             .remap_key_type::<ByteSlice>() |             .remap_key_type::<ByteSlice>() | ||||||
|             .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); |             .prefix_iter(&rtxn, &[0, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|         assert_eq!(count, 3); |         assert_eq!(count, 3); | ||||||
|         drop(rtxn); |         drop(rtxn); | ||||||
|  |  | ||||||
| @@ -718,9 +716,12 @@ mod tests { | |||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         // Only count the field_id 0 and level 0 facet values. |         // Only count the field_id 0 and level 0 facet values. | ||||||
|         // TODO we must support typed CSVs for numbers to be understood. |         // TODO we must support typed CSVs for numbers to be understood. | ||||||
|         let count = index.facet_id_f64_docids |         let count = index | ||||||
|  |             .facet_id_f64_docids | ||||||
|             .remap_key_type::<ByteSlice>() |             .remap_key_type::<ByteSlice>() | ||||||
|             .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); |             .prefix_iter(&rtxn, &[0, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|         assert_eq!(count, 4); |         assert_eq!(count, 4); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -969,7 +970,7 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = Settings::new(&mut wtxn, &index, 0); |         let mut builder = Settings::new(&mut wtxn, &index, 0); | ||||||
|         builder.set_displayed_fields(vec!["hello".to_string()]); |         builder.set_displayed_fields(vec!["hello".to_string()]); | ||||||
|         builder.set_filterable_fields(hashset!{ S("age"), S("toto") }); |         builder.set_filterable_fields(hashset! { S("age"), S("toto") }); | ||||||
|         builder.set_criteria(vec!["asc(toto)".to_string()]); |         builder.set_criteria(vec!["asc(toto)".to_string()]); | ||||||
|         builder.execute(|_, _| ()).unwrap(); |         builder.execute(|_, _| ()).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -1,8 +1,8 @@ | |||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use rayon::ThreadPool; | use rayon::ThreadPool; | ||||||
|  |  | ||||||
|  | use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings}; | ||||||
| use crate::{Index, Result}; | use crate::{Index, Result}; | ||||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; |  | ||||||
|  |  | ||||||
| pub struct UpdateBuilder<'a> { | pub struct UpdateBuilder<'a> { | ||||||
|     pub(crate) log_every_n: Option<usize>, |     pub(crate) log_every_n: Option<usize>, | ||||||
| @@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> ClearDocuments<'t, 'u, 'i> |     ) -> ClearDocuments<'t, 'u, 'i> { | ||||||
|     { |  | ||||||
|         ClearDocuments::new(wtxn, index, self.update_id) |         ClearDocuments::new(wtxn, index, self.update_id) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> Result<DeleteDocuments<'t, 'u, 'i>> |     ) -> Result<DeleteDocuments<'t, 'u, 'i>> { | ||||||
|     { |  | ||||||
|         DeleteDocuments::new(wtxn, index, self.update_id) |         DeleteDocuments::new(wtxn, index, self.update_id) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> IndexDocuments<'t, 'u, 'i, 'a> |     ) -> IndexDocuments<'t, 'u, 'i, 'a> { | ||||||
|     { |  | ||||||
|         let mut builder = IndexDocuments::new(wtxn, index, self.update_id); |         let mut builder = IndexDocuments::new(wtxn, index, self.update_id); | ||||||
|  |  | ||||||
|         builder.log_every_n = self.log_every_n; |         builder.log_every_n = self.log_every_n; | ||||||
| @@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> Settings<'a, 't, 'u, 'i> |     ) -> Settings<'a, 't, 'u, 'i> { | ||||||
|     { |  | ||||||
|         let mut builder = Settings::new(wtxn, index, self.update_id); |         let mut builder = Settings::new(wtxn, index, self.update_id); | ||||||
|  |  | ||||||
|         builder.log_every_n = self.log_every_n; |         builder.log_every_n = self.log_every_n; | ||||||
| @@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> Facets<'t, 'u, 'i> |     ) -> Facets<'t, 'u, 'i> { | ||||||
|     { |  | ||||||
|         let mut builder = Facets::new(wtxn, index, self.update_id); |         let mut builder = Facets::new(wtxn, index, self.update_id); | ||||||
|  |  | ||||||
|         builder.chunk_compression_type = self.chunk_compression_type; |         builder.chunk_compression_type = self.chunk_compression_type; | ||||||
|   | |||||||
| @@ -1,15 +1,13 @@ | |||||||
| use std::str; | use std::str; | ||||||
|  |  | ||||||
| use crate::Index; |  | ||||||
| use fst::Streamer; | use fst::Streamer; | ||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
|  |  | ||||||
| use crate::Result; |  | ||||||
| use crate::update::index_documents::WriteMethod; |  | ||||||
| use crate::update::index_documents::{ | use crate::update::index_documents::{ | ||||||
|     create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, |     create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, | ||||||
| }; | }; | ||||||
|  | use crate::{Index, Result}; | ||||||
|  |  | ||||||
| pub struct WordPrefixDocids<'t, 'u, 'i> { | pub struct WordPrefixDocids<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { |     pub fn new( | ||||||
|  |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|  |         index: &'i Index, | ||||||
|  |     ) -> WordPrefixDocids<'t, 'u, 'i> { | ||||||
|         WordPrefixDocids { |         WordPrefixDocids { | ||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
|   | |||||||
| @@ -1,18 +1,17 @@ | |||||||
| use std::str; | use std::str; | ||||||
|  |  | ||||||
| use fst::automaton::{Automaton, Str}; | use fst::automaton::{Automaton, Str}; | ||||||
| use fst::{Streamer, IntoStreamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use heed::BytesEncode; |  | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
|  | use heed::BytesEncode; | ||||||
| use log::debug; | use log::debug; | ||||||
|  |  | ||||||
| use crate::{Index, Result}; |  | ||||||
| use crate::heed_codec::StrStrU8Codec; | use crate::heed_codec::StrStrU8Codec; | ||||||
| use crate::update::index_documents::{ | use crate::update::index_documents::{ | ||||||
|     WriteMethod, create_sorter, sorter_into_lmdb_database, |     cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, | ||||||
|     cbo_roaring_bitmap_merge, |  | ||||||
| }; | }; | ||||||
|  | use crate::{Index, Result}; | ||||||
|  |  | ||||||
| pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { | pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { | |||||||
|     pub fn new( |     pub fn new( | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> |     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||||
|     { |  | ||||||
|         WordPrefixPairProximityDocids { |         WordPrefixPairProximityDocids { | ||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
|   | |||||||
| @@ -1,25 +1,23 @@ | |||||||
| use std::{cmp, str}; |  | ||||||
| use std::convert::TryFrom; | use std::convert::TryFrom; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::num::NonZeroU32; | use std::num::NonZeroU32; | ||||||
|  | use std::{cmp, str}; | ||||||
|  |  | ||||||
| use fst::automaton::{self, Automaton}; | use fst::automaton::{self, Automaton}; | ||||||
| use fst::{Streamer, IntoStreamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use grenad::{CompressionType, Reader, Writer, FileFuse}; | use grenad::{CompressionType, FileFuse, Reader, Writer}; | ||||||
| use heed::types::{ByteSlice, DecodeIgnore, Str}; | use heed::types::{ByteSlice, DecodeIgnore, Str}; | ||||||
| use heed::{BytesEncode, Error}; | use heed::{BytesEncode, Error}; | ||||||
| use log::debug; | use log::debug; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::error::InternalError; | use crate::error::InternalError; | ||||||
| use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; | use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; | ||||||
| use crate::Result; |  | ||||||
| use crate::update::index_documents::WriteMethod; |  | ||||||
| use crate::update::index_documents::{ | use crate::update::index_documents::{ | ||||||
|     create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, |     cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, | ||||||
|     cbo_roaring_bitmap_merge, sorter_into_lmdb_database |     write_into_lmdb_database, writer_into_reader, WriteMethod, | ||||||
| }; | }; | ||||||
| use crate::{Index, TreeLevel}; | use crate::{Index, Result, TreeLevel}; | ||||||
|  |  | ||||||
| pub struct WordsLevelPositions<'t, 'u, 'i> { | pub struct WordsLevelPositions<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { |     pub fn new( | ||||||
|  |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|  |         index: &'i Index, | ||||||
|  |     ) -> WordsLevelPositions<'t, 'u, 'i> { | ||||||
|         WordsLevelPositions { |         WordsLevelPositions { | ||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
| @@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | |||||||
|             self.wtxn, |             self.wtxn, | ||||||
|             *self.index.word_prefix_level_position_docids.as_polymorph(), |             *self.index.word_prefix_level_position_docids.as_polymorph(), | ||||||
|             entries, |             entries, | ||||||
|             |_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), |             |_, _| { | ||||||
|  |                 Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) | ||||||
|  |             }, | ||||||
|             WriteMethod::Append, |             WriteMethod::Append, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
| @@ -176,13 +179,11 @@ fn compute_positions_levels( | |||||||
|     shrink_size: Option<u64>, |     shrink_size: Option<u64>, | ||||||
|     level_group_size: NonZeroU32, |     level_group_size: NonZeroU32, | ||||||
|     min_level_size: NonZeroU32, |     min_level_size: NonZeroU32, | ||||||
| ) -> Result<Reader<FileFuse>> | ) -> Result<Reader<FileFuse>> { | ||||||
| { |  | ||||||
|     // It is forbidden to keep a cursor and write in a database at the same time with LMDB |     // It is forbidden to keep a cursor and write in a database at the same time with LMDB | ||||||
|     // therefore we write the facet levels entries into a grenad file before transfering them. |     // therefore we write the facet levels entries into a grenad file before transfering them. | ||||||
|     let mut writer = tempfile::tempfile().and_then(|file| { |     let mut writer = tempfile::tempfile() | ||||||
|         create_writer(compression_type, compression_level, file) |         .and_then(|file| create_writer(compression_type, compression_level, file))?; | ||||||
|     })?; |  | ||||||
|  |  | ||||||
|     for result in words_db.iter(rtxn)? { |     for result in words_db.iter(rtxn)? { | ||||||
|         let (word, ()) = result?; |         let (word, ()) = result?; | ||||||
| @@ -193,7 +194,8 @@ fn compute_positions_levels( | |||||||
|             left..=right |             left..=right | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>() |         let first_level_size = words_positions_db | ||||||
|  |             .remap_data_type::<DecodeIgnore>() | ||||||
|             .range(rtxn, &level_0_range)? |             .range(rtxn, &level_0_range)? | ||||||
|             .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; |             .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; | ||||||
|  |  | ||||||
| @@ -253,8 +255,7 @@ fn write_level_entry( | |||||||
|     left: u32, |     left: u32, | ||||||
|     right: u32, |     right: u32, | ||||||
|     ids: &RoaringBitmap, |     ids: &RoaringBitmap, | ||||||
| ) -> Result<()> | ) -> Result<()> { | ||||||
| { |  | ||||||
|     let key = (word, level, left, right); |     let key = (word, level, left, right); | ||||||
|     let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; |     let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||||
|     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; |     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; | ||||||
|   | |||||||
| @@ -2,7 +2,8 @@ use std::iter::FromIterator; | |||||||
| use std::str; | use std::str; | ||||||
|  |  | ||||||
| use fst::Streamer; | use fst::Streamer; | ||||||
| use crate::{Index, SmallString32, Result}; |  | ||||||
|  | use crate::{Index, Result, SmallString32}; | ||||||
|  |  | ||||||
| pub struct WordsPrefixesFst<'t, 'u, 'i> { | pub struct WordsPrefixesFst<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | |||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         update_id: u64, |         update_id: u64, | ||||||
|     ) -> WordsPrefixesFst<'t, 'u, 'i> |     ) -> WordsPrefixesFst<'t, 'u, 'i> { | ||||||
|     { |  | ||||||
|         WordsPrefixesFst { |         WordsPrefixesFst { | ||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
| @@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | |||||||
|  |  | ||||||
|         let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); |         let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); | ||||||
|         for n in 1..=self.max_prefix_length { |         for n in 1..=self.max_prefix_length { | ||||||
|  |  | ||||||
|             let mut current_prefix = SmallString32::new(); |             let mut current_prefix = SmallString32::new(); | ||||||
|             let mut current_prefix_count = 0; |             let mut current_prefix_count = 0; | ||||||
|             let mut builder = fst::SetBuilder::memory(); |             let mut builder = fst::SetBuilder::memory(); | ||||||
|   | |||||||
| @@ -1,9 +1,8 @@ | |||||||
| use milli::{Criterion, Index, DocumentId}; |  | ||||||
| use milli::update::{IndexDocuments, UpdateFormat, Settings}; |  | ||||||
|  |  | ||||||
| use big_s::S; | use big_s::S; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use maplit::{hashmap, hashset}; | use maplit::{hashmap, hashset}; | ||||||
|  | use milli::update::{IndexDocuments, Settings, UpdateFormat}; | ||||||
|  | use milli::{Criterion, DocumentId, Index}; | ||||||
| use serde::Deserialize; | use serde::Deserialize; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
| @@ -11,7 +10,8 @@ mod query_criteria; | |||||||
|  |  | ||||||
| pub const TEST_QUERY: &'static str = "hello world america"; | pub const TEST_QUERY: &'static str = "hello world america"; | ||||||
|  |  | ||||||
| pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; | pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = | ||||||
|  |     &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; | ||||||
|  |  | ||||||
| pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); | pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); | ||||||
|  |  | ||||||
| @@ -27,16 +27,16 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | |||||||
|  |  | ||||||
|     let criteria = criteria.iter().map(|c| c.to_string()).collect(); |     let criteria = criteria.iter().map(|c| c.to_string()).collect(); | ||||||
|     builder.set_criteria(criteria); |     builder.set_criteria(criteria); | ||||||
|     builder.set_filterable_fields(hashset!{ |     builder.set_filterable_fields(hashset! { | ||||||
|         S("tag"), |         S("tag"), | ||||||
|         S("asc_desc_rank"), |         S("asc_desc_rank"), | ||||||
|     }); |     }); | ||||||
|     builder.set_synonyms(hashmap!{ |     builder.set_synonyms(hashmap! { | ||||||
|         S("hello") => vec![S("good morning")], |         S("hello") => vec![S("good morning")], | ||||||
|         S("world") => vec![S("earth")], |         S("world") => vec![S("earth")], | ||||||
|         S("america") => vec![S("the united states")], |         S("america") => vec![S("the united states")], | ||||||
|     }); |     }); | ||||||
|     builder.set_searchable_fields(vec![S("title"),S("description")]); |     builder.set_searchable_fields(vec![S("title"), S("description")]); | ||||||
|     builder.execute(|_, _| ()).unwrap(); |     builder.execute(|_, _| ()).unwrap(); | ||||||
|  |  | ||||||
|     // index documents |     // index documents | ||||||
| @@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | |||||||
| pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> { | pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> { | ||||||
|     let mut rtxn = index.read_txn().unwrap(); |     let mut rtxn = index.read_txn().unwrap(); | ||||||
|     let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); |     let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); | ||||||
|     let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); |     let docid_map: std::collections::HashMap<_, _> = | ||||||
|  |         EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); | ||||||
|     internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() |     internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec<TestDocument> { | pub fn expected_order( | ||||||
|     let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); |     criteria: &[Criterion], | ||||||
|  |     authorize_typo: bool, | ||||||
|  |     optional_words: bool, | ||||||
|  | ) -> Vec<TestDocument> { | ||||||
|  |     let dataset = | ||||||
|  |         serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); | ||||||
|     let mut groups: Vec<Vec<TestDocument>> = vec![dataset]; |     let mut groups: Vec<Vec<TestDocument>> = vec![dataset]; | ||||||
|  |  | ||||||
|     for criterion in criteria { |     for criterion in criteria { | ||||||
| @@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor | |||||||
|             match criterion { |             match criterion { | ||||||
|                 Criterion::Attribute => { |                 Criterion::Attribute => { | ||||||
|                     group.sort_by_key(|d| d.attribute_rank); |                     group.sort_by_key(|d| d.attribute_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); |                     new_groups | ||||||
|                 }, |                         .extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); | ||||||
|  |                 } | ||||||
|                 Criterion::Exactness => { |                 Criterion::Exactness => { | ||||||
|                     group.sort_by_key(|d| d.exact_rank); |                     group.sort_by_key(|d| d.exact_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); |                     new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); | ||||||
|                 }, |                 } | ||||||
|                 Criterion::Proximity => { |                 Criterion::Proximity => { | ||||||
|                     group.sort_by_key(|d| d.proximity_rank); |                     group.sort_by_key(|d| d.proximity_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); |                     new_groups | ||||||
|                 }, |                         .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); | ||||||
|  |                 } | ||||||
|                 Criterion::Typo => { |                 Criterion::Typo => { | ||||||
|                     group.sort_by_key(|d| d.typo_rank); |                     group.sort_by_key(|d| d.typo_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); |                     new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); | ||||||
|                 }, |                 } | ||||||
|                 Criterion::Words => { |                 Criterion::Words => { | ||||||
|                     group.sort_by_key(|d| d.word_rank); |                     group.sort_by_key(|d| d.word_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); |                     new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); | ||||||
|                 }, |                 } | ||||||
|                 Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { |                 Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { | ||||||
|                     group.sort_by_key(|d| d.asc_desc_rank); |                     group.sort_by_key(|d| d.asc_desc_rank); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); |                     new_groups | ||||||
|                 }, |                         .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); | ||||||
|                 Criterion::Desc(field_name)  if field_name == "asc_desc_rank" => { |                 } | ||||||
|  |                 Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { | ||||||
|                     group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); |                     group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); | ||||||
|                     new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); |                     new_groups | ||||||
|                 }, |                         .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); | ||||||
|  |                 } | ||||||
|                 Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), |                 Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -1,9 +1,9 @@ | |||||||
| use big_s::S; | use big_s::S; | ||||||
| use milli::update::Settings; | use milli::update::Settings; | ||||||
| use milli::{Search, SearchResult, Criterion}; | use milli::{Criterion, Search, SearchResult}; | ||||||
|  | use Criterion::*; | ||||||
|  |  | ||||||
| use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; | use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; | ||||||
| use Criterion::*; |  | ||||||
|  |  | ||||||
| const ALLOW_TYPOS: bool = true; | const ALLOW_TYPOS: bool = true; | ||||||
| const DISALLOW_TYPOS: bool = false; | const DISALLOW_TYPOS: bool = false; | ||||||
| @@ -35,29 +35,54 @@ macro_rules! test_criterion { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(none_allow_typo,                     ALLOW_OPTIONAL_WORDS,      ALLOW_TYPOS); | test_criterion!(none_allow_typo,                     ALLOW_OPTIONAL_WORDS,      ALLOW_TYPOS); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(none_disallow_typo,                  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS); | test_criterion!(none_disallow_typo,                  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(words_allow_typo,                    ALLOW_OPTIONAL_WORDS,      ALLOW_TYPOS,    Words); | test_criterion!(words_allow_typo,                    ALLOW_OPTIONAL_WORDS,      ALLOW_TYPOS,    Words); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(attribute_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Attribute); | test_criterion!(attribute_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Attribute); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(attribute_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Attribute); | test_criterion!(attribute_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Attribute); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(exactness_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Exactness); | test_criterion!(exactness_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Exactness); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(exactness_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Exactness); | test_criterion!(exactness_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Exactness); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(proximity_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Proximity); | test_criterion!(proximity_allow_typo,                DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Proximity); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(proximity_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Proximity); | test_criterion!(proximity_disallow_typo,             DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Proximity); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(asc_allow_typo,                      DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Asc(S("asc_desc_rank"))); | test_criterion!(asc_allow_typo,                      DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Asc(S("asc_desc_rank"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(asc_disallow_typo,                   DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); | test_criterion!(asc_disallow_typo,                   DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(desc_allow_typo,                     DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Desc(S("asc_desc_rank"))); | test_criterion!(desc_allow_typo,                     DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Desc(S("asc_desc_rank"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(desc_disallow_typo,                  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); | test_criterion!(desc_disallow_typo,                  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(asc_unexisting_field_allow_typo,     DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Asc(S("unexisting_field"))); | test_criterion!(asc_unexisting_field_allow_typo,     DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Asc(S("unexisting_field"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(asc_unexisting_field_disallow_typo,  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Asc(S("unexisting_field"))); | test_criterion!(asc_unexisting_field_disallow_typo,  DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Asc(S("unexisting_field"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(desc_unexisting_field_allow_typo,    DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Desc(S("unexisting_field"))); | test_criterion!(desc_unexisting_field_allow_typo,    DISALLOW_OPTIONAL_WORDS,   ALLOW_TYPOS,    Desc(S("unexisting_field"))); | ||||||
|  | #[rustfmt::skip] | ||||||
| test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Desc(S("unexisting_field"))); | test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS,   DISALLOW_TYPOS, Desc(S("unexisting_field"))); | ||||||
|  |  | ||||||
| #[test] | #[test] | ||||||
| fn criteria_mixup() { | fn criteria_mixup() { | ||||||
|     use Criterion::*; |     use Criterion::*; | ||||||
|     let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); |     let index = search::setup_search_index_with_criteria(&vec![ | ||||||
|  |         Words, | ||||||
|  |         Attribute, | ||||||
|  |         Desc(S("asc_desc_rank")), | ||||||
|  |         Exactness, | ||||||
|  |         Proximity, | ||||||
|  |         Typo, | ||||||
|  |     ]); | ||||||
|  |  | ||||||
|  |     #[rustfmt::skip] | ||||||
|     let criteria_mix = { |     let criteria_mix = { | ||||||
|         // Criterion doesn't implement Copy, we create a new Criterion using a closure |         // Criterion doesn't implement Copy, we create a new Criterion using a closure | ||||||
|         let desc = || Desc(S("asc_desc_rank")); |         let desc = || Desc(S("asc_desc_rank")); | ||||||
| @@ -205,10 +230,11 @@ fn criteria_mixup() { | |||||||
|  |  | ||||||
|         let SearchResult { documents_ids, .. } = search.execute().unwrap(); |         let SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |  | ||||||
|         let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) |         let expected_external_ids: Vec<_> = | ||||||
|             .into_iter() |             search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) | ||||||
|             .map(|d| d.id) |                 .into_iter() | ||||||
|             .collect(); |                 .map(|d| d.id) | ||||||
|  |                 .collect(); | ||||||
|         let documents_ids = search::internal_to_external_ids(&index, &documents_ids); |         let documents_ids = search::internal_to_external_ids(&index, &documents_ids); | ||||||
|  |  | ||||||
|         assert_eq!(documents_ids, expected_external_ids); |         assert_eq!(documents_ids, expected_external_ids); | ||||||
|   | |||||||
							
								
								
									
										36
									
								
								script/pre-commit
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										36
									
								
								script/pre-commit
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,36 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  |  | ||||||
|  | cargo check --workspace --all-targets &>/dev/null | ||||||
|  | result=$? | ||||||
|  |  | ||||||
|  | if [[ ${result} -ne 0 ]] ; then | ||||||
|  | 	cat <<\EOF | ||||||
|  | The project does not compile. You might want to fix your error before committing. | ||||||
|  |  | ||||||
|  | If you still want to commit you can do it by appending | ||||||
|  | --no-verify | ||||||
|  | at the end of your previous command. | ||||||
|  |  | ||||||
|  | If you are running a variant of bash you can directly paste this command in your terminal: | ||||||
|  | !! --no-verify | ||||||
|  | EOF | ||||||
|  |     exit 1 | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | cargo fmt --all -- --check &>/dev/null | ||||||
|  | result=$? | ||||||
|  |  | ||||||
|  | if [[ ${result} -ne 0 ]] ; then | ||||||
|  | 	cat <<\EOF | ||||||
|  | The project is badly formatted. Please run: | ||||||
|  | cargo fmt --all | ||||||
|  |  | ||||||
|  | If you want to create your commit without proper formatting you can add | ||||||
|  | --no-verify | ||||||
|  | at the end of your commit. | ||||||
|  |  | ||||||
|  | If you are running a variant of bash you can directly paste this command in your terminal: | ||||||
|  | !! --no-verify | ||||||
|  | EOF | ||||||
|  |     exit 1 | ||||||
|  | fi | ||||||
| @@ -6,10 +6,9 @@ use std::time::Instant; | |||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use log::debug; | use log::debug; | ||||||
|  | use milli::{obkv_to_json, Index}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  |  | ||||||
| use milli::{Index, obkv_to_json}; |  | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| #[global_allocator] | #[global_allocator] | ||||||
| static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; | ||||||
| @@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         if opt.print_facet_distribution { |         if opt.print_facet_distribution { | ||||||
|             let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; |             let facets = | ||||||
|  |                 index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; | ||||||
|             serde_json::to_writer(&mut stdout, &facets)?; |             serde_json::to_writer(&mut stdout, &facets)?; | ||||||
|             let _ = writeln!(&mut stdout); |             let _ = writeln!(&mut stdout); | ||||||
|         } |         } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user