Compare commits

..

10 Commits

Author SHA1 Message Date
fa27bf3513 fix clippy 2023-11-29 15:04:41 +01:00
09ebf7428b stream and chunk the data 2023-11-29 14:27:50 +01:00
76e0248cdb gzip the tasks 2023-11-29 13:09:04 +01:00
7f0abaf582 parse the url correctly 2023-11-28 16:28:11 +01:00
6f135457f8 update and fix the test 2023-11-28 15:55:48 +01:00
41f3a30b0b return a task view instead of a task 2023-11-28 15:08:13 +01:00
9090e0fe9d add a first working test with actixweb 2023-11-28 14:47:07 +01:00
b3098b9d9a start writing a test with actix but it doesn't works 2023-11-28 14:01:44 +01:00
9b1de777de Implement the webhook 2023-11-28 11:40:09 +01:00
00f0711207 add the option 2023-11-27 15:22:44 +01:00
135 changed files with 1793 additions and 7965 deletions

1210
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -19,7 +19,7 @@ members = [
]
[workspace.package]
version = "1.6.0"
version = "1.5.0"
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "Meilisearch HTTP server"
homepage = "https://meilisearch.com"

View File

@ -1,5 +1,5 @@
# Compile
FROM rust:1.71.1-alpine3.18 AS compiler
FROM rust:alpine3.16 AS compiler
RUN apk add -q --update-cache --no-cache build-base openssl-dev

View File

@ -36,7 +36,7 @@ fn setup_index() -> Index {
}
fn setup_settings<'t>(
wtxn: &mut RwTxn<'t>,
wtxn: &mut RwTxn<'t, '_>,
index: &'t Index,
primary_key: &str,
searchable_fields: &[&str],

View File

@ -129,6 +129,3 @@ experimental_enable_metrics = false
# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
experimental_reduce_indexing_memory_usage = false
# Experimentally reduces the maximum number of tasks that will be processed at once, see: <https://github.com/orgs/meilisearch/discussions/713>
# experimental_max_number_of_batched_tasks = 100

View File

@ -267,7 +267,6 @@ pub(crate) mod test {
dictionary: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
proximity_precision: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::Set(FacetingSettings {
max_values_per_facet: Setting::Set(111),
@ -276,7 +275,6 @@ pub(crate) mod test {
),
}),
pagination: Setting::NotSet,
embedders: Setting::NotSet,
_kind: std::marker::PhantomData,
};
settings.check()

View File

@ -345,7 +345,6 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
dictionary: v6::Setting::NotSet,
synonyms: settings.synonyms.into(),
distinct_attribute: settings.distinct_attribute.into(),
proximity_precision: v6::Setting::NotSet,
typo_tolerance: match settings.typo_tolerance {
v5::Setting::Set(typo) => v6::Setting::Set(v6::TypoTolerance {
enabled: typo.enabled.into(),
@ -378,7 +377,6 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
v5::Setting::Reset => v6::Setting::Reset,
v5::Setting::NotSet => v6::Setting::NotSet,
},
embedders: v6::Setting::NotSet,
_kind: std::marker::PhantomData,
}
}

View File

@ -13,12 +13,12 @@ use crate::{Result, Version};
mod compat;
mod v1;
mod v2;
mod v3;
mod v4;
mod v5;
mod v6;
pub(self) mod v1;
pub(self) mod v2;
pub(self) mod v3;
pub(self) mod v4;
pub(self) mod v5;
pub(self) mod v6;
pub type Document = serde_json::Map<String, serde_json::Value>;
pub type UpdateFile = dyn Iterator<Item = Result<Document>>;

View File

@ -56,7 +56,8 @@ pub enum RankingRule {
Desc(String),
}
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(asc|desc)\(([\w_-]+)\)").unwrap());
static ASC_DESC_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
impl FromStr for RankingRule {
type Err = ();

View File

@ -564,10 +564,10 @@ pub mod tests {
#[test]
fn parse_escaped() {
insta::assert_display_snapshot!(p(r"title = 'foo\\'"), @r#"{title} = {foo\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
// but it also works with other sequencies
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
}

View File

@ -270,8 +270,8 @@ pub mod test {
("aaaa", "", rtok("", "aaaa"), "aaaa"),
(r#"aa"aa"#, r#""aa"#, rtok("", "aa"), "aa"),
(r#"aa\"aa"#, r#""#, rtok("", r#"aa\"aa"#), r#"aa"aa"#),
(r"aa\\\aa", r#""#, rtok("", r"aa\\\aa"), r"aa\\\aa"),
(r#"aa\\"\aa"#, r#""\aa"#, rtok("", r"aa\\"), r"aa\\"),
(r#"aa\\\aa"#, r#""#, rtok("", r#"aa\\\aa"#), r#"aa\\\aa"#),
(r#"aa\\"\aa"#, r#""\aa"#, rtok("", r#"aa\\"#), r#"aa\\"#),
(r#"aa\\\"\aa"#, r#""#, rtok("", r#"aa\\\"\aa"#), r#"aa\\"\aa"#),
(r#"\"\""#, r#""#, rtok("", r#"\"\""#), r#""""#),
];
@ -301,12 +301,12 @@ pub mod test {
);
// simple quote
assert_eq!(
unescape(Span::new_extra(r"Hello \'World\'", ""), '\''),
unescape(Span::new_extra(r#"Hello \'World\'"#, ""), '\''),
r#"Hello 'World'"#.to_string()
);
assert_eq!(
unescape(Span::new_extra(r"Hello \\\'World\\\'", ""), '\''),
r"Hello \\'World\\'".to_string()
unescape(Span::new_extra(r#"Hello \\\'World\\\'"#, ""), '\''),
r#"Hello \\'World\\'"#.to_string()
);
}
@ -335,19 +335,19 @@ pub mod test {
("\"cha'nnel\"", "cha'nnel", false),
("I'm tamo", "I", false),
// escaped thing but not quote
(r#""\\""#, r"\", true),
(r#""\\\\\\""#, r"\\\", true),
(r#""aa\\aa""#, r"aa\aa", true),
(r#""\\""#, r#"\"#, true),
(r#""\\\\\\""#, r#"\\\"#, true),
(r#""aa\\aa""#, r#"aa\aa"#, true),
// with double quote
(r#""Hello \"world\"""#, r#"Hello "world""#, true),
(r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
(r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
(r#""\"\"""#, r#""""#, true),
// with simple quote
(r"'Hello \'world\''", r#"Hello 'world'"#, true),
(r"'Hello \\\'world\\\''", r"Hello \'world\'", true),
(r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
(r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
(r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
(r"'\'\''", r#"''"#, true),
(r#"'\'\''"#, r#"''"#, true),
];
for (input, expected, escaped) in test_case {

View File

@ -113,7 +113,7 @@ fn main() {
index.documents(&wtxn, res.documents_ids).unwrap();
progression.fetch_add(1, Ordering::Relaxed);
}
wtxn.abort();
wtxn.abort().unwrap();
});
if let err @ Err(_) = handle.join() {
stop.store(true, Ordering::Relaxed);

View File

@ -32,7 +32,7 @@ use meilisearch_types::milli::heed::CompactionOption;
use meilisearch_types::milli::update::{
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
};
use meilisearch_types::milli::{self, Filter};
use meilisearch_types::milli::{self, Filter, BEU32};
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
@ -584,9 +584,7 @@ impl IndexScheduler {
let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;
// If autobatching is disabled we only take one task at a time.
// Otherwise, we take only a maximum of tasks to create batches.
let tasks_limit =
if self.autobatching_enabled { self.max_number_of_batched_tasks } else { 1 };
let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
let enqueued = index_tasks
.into_iter()
@ -717,7 +715,7 @@ impl IndexScheduler {
// 2. Snapshot the index-scheduler LMDB env
//
// When we call copy_to_file, LMDB opens a read transaction by itself,
// When we call copy_to_path, LMDB opens a read transaction by itself,
// we can't provide our own. It is an issue as we would like to know
// the update files to copy but new ones can be enqueued between the copy
// of the env and the new transaction we open to retrieve the enqueued tasks.
@ -730,7 +728,7 @@ impl IndexScheduler {
// 2.1 First copy the LMDB env of the index-scheduler
let dst = temp_snapshot_dir.path().join("tasks");
fs::create_dir_all(&dst)?;
self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
self.env.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
// 2.2 Create a read transaction on the index-scheduler
let rtxn = self.env.read_txn()?;
@ -755,7 +753,7 @@ impl IndexScheduler {
let index = self.index_mapper.index(&rtxn, name)?;
let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string());
fs::create_dir_all(&dst)?;
index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
index.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
}
drop(rtxn);
@ -768,7 +766,7 @@ impl IndexScheduler {
.map_size(1024 * 1024 * 1024) // 1 GiB
.max_dbs(2)
.open(&self.auth_path)?;
auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
auth.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
// 5. Copy and tarball the flat snapshot
// 5.1 Find the original name of the database
@ -936,8 +934,8 @@ impl IndexScheduler {
};
// the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick
self.index_mapper
.set_currently_updating_index(Some((index_uid.clone(), index.clone())));
*self.currently_updating_index.write().unwrap() =
Some((index_uid.clone(), index.clone()));
let mut index_wtxn = index.write_txn()?;
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
@ -1108,7 +1106,7 @@ impl IndexScheduler {
for task_id in &index_lhs_task_ids | &index_rhs_task_ids {
let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
swap_index_uid_in_task(&mut task, (lhs, rhs));
self.all_tasks.put(wtxn, &task_id, &task)?;
self.all_tasks.put(wtxn, &BEU32::new(task_id), &task)?;
}
// 4. remove the task from indexuid = before_name
@ -1134,7 +1132,7 @@ impl IndexScheduler {
/// The list of processed tasks.
fn apply_index_operation<'i>(
&self,
index_wtxn: &mut RwTxn<'i>,
index_wtxn: &mut RwTxn<'i, '_>,
index: &'i Index,
operation: IndexOperation,
) -> Result<Vec<Task>> {
@ -1202,10 +1200,6 @@ impl IndexScheduler {
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
let embedder_configs = index.embedding_configs(index_wtxn)?;
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
let embedders = self.embedders(embedder_configs)?;
let mut builder = milli::update::IndexDocuments::new(
index_wtxn,
index,
@ -1224,8 +1218,6 @@ impl IndexScheduler {
let (new_builder, user_result) = builder.add_documents(reader)?;
builder = new_builder;
builder = builder.with_embedders(embedders.clone());
let received_documents =
if let Some(Details::DocumentAdditionOrUpdate {
received_documents,
@ -1487,9 +1479,10 @@ impl IndexScheduler {
}
for task in to_delete_tasks.iter() {
self.all_tasks.delete(wtxn, &task)?;
self.all_tasks.delete(wtxn, &BEU32::new(task))?;
}
for canceled_by in affected_canceled_by {
let canceled_by = BEU32::new(canceled_by);
if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? {
tasks -= &to_delete_tasks;
if tasks.is_empty() {
@ -1537,14 +1530,14 @@ impl IndexScheduler {
task.details = task.details.map(|d| d.to_failed());
self.update_task(wtxn, &task)?;
}
self.canceled_by.put(wtxn, &cancel_task_id, &tasks_to_cancel)?;
self.canceled_by.put(wtxn, &BEU32::new(cancel_task_id), &tasks_to_cancel)?;
Ok(content_files_to_delete)
}
}
fn delete_document_by_filter<'a>(
wtxn: &mut RwTxn<'a>,
wtxn: &mut RwTxn<'a, '_>,
filter: &serde_json::Value,
indexer_config: &IndexerConfig,
must_stop_processing: MustStopProcessing,

View File

@ -56,12 +56,12 @@ impl RoFeatures {
}
}
pub fn check_vector(&self, disabled_action: &'static str) -> Result<()> {
pub fn check_vector(&self) -> Result<()> {
if self.runtime.vector_store {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action,
disabled_action: "Passing `vector` as a query parameter",
feature: "vector store",
issue_link: "https://github.com/meilisearch/product/discussions/677",
}

View File

@ -1,8 +1,12 @@
/// the map size to use when we don't succeed in reading it in indexes.
const DEFAULT_MAP_SIZE: usize = 10 * 1024 * 1024 * 1024; // 10 GiB
use std::collections::BTreeMap;
use std::path::Path;
use std::time::Duration;
use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions};
use meilisearch_types::heed::flags::Flags;
use meilisearch_types::heed::{EnvClosingEvent, EnvOpenOptions};
use meilisearch_types::milli::Index;
use time::OffsetDateTime;
use uuid::Uuid;
@ -232,7 +236,7 @@ impl IndexMap {
enable_mdb_writemap: bool,
map_size_growth: usize,
) {
let map_size = index.map_size() + map_size_growth;
let map_size = index.map_size().unwrap_or(DEFAULT_MAP_SIZE) + map_size_growth;
let closing_event = index.prepare_for_closing();
let generation = self.next_generation();
self.unavailable.insert(
@ -305,7 +309,7 @@ fn create_or_open_index(
options.map_size(clamp_to_page_size(map_size));
options.max_readers(1024);
if enable_mdb_writemap {
unsafe { options.flags(EnvFlags::WRITE_MAP) };
unsafe { options.flag(Flags::MdbWriteMap) };
}
if let Some((created, updated)) = date {
@ -384,7 +388,7 @@ mod tests {
fn assert_index_size(index: Index, expected: usize) {
let expected = clamp_to_page_size(expected);
let index_map_size = index.map_size();
let index_map_size = index.map_size().unwrap();
assert_eq!(index_map_size, expected);
}
}

View File

@ -69,10 +69,6 @@ pub struct IndexMapper {
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
enable_mdb_writemap: bool,
pub indexer_config: Arc<IndexerConfig>,
/// A few types of long running batches of tasks that act on a single index set this field
/// so that a handle to the index is available from other threads (search) in an optimized manner.
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
}
/// Whether the index is available for use or is forbidden to be inserted back in the index map
@ -155,7 +151,6 @@ impl IndexMapper {
index_growth_amount,
enable_mdb_writemap,
indexer_config: Arc::new(indexer_config),
currently_updating_index: Default::default(),
})
}
@ -308,14 +303,6 @@ impl IndexMapper {
/// Return an index, may open it if it wasn't already opened.
pub fn index(&self, rtxn: &RoTxn, name: &str) -> Result<Index> {
if let Some((current_name, current_index)) =
self.currently_updating_index.read().unwrap().as_ref()
{
if current_name == name {
return Ok(current_index.clone());
}
}
let uuid = self
.index_mapping
.get(rtxn, name)?
@ -487,8 +474,4 @@ impl IndexMapper {
pub fn indexer_config(&self) -> &IndexerConfig {
&self.indexer_config
}
pub fn set_currently_updating_index(&self, index: Option<(String, Index)>) {
*self.currently_updating_index.write().unwrap() = index;
}
}

View File

@ -1,7 +1,7 @@
use std::collections::BTreeSet;
use std::fmt::Write;
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::{Database, RoTxn};
use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Details, Task};
@ -30,7 +30,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
index_mapper,
features: _,
max_number_of_tasks: _,
max_number_of_batched_tasks: _,
puffin_frame: _,
wake_up: _,
dumps_path: _,
@ -38,11 +37,10 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
auth_path: _,
version_file_path: _,
webhook_url: _,
webhook_authorization_header: _,
test_breakpoint_sdr: _,
planned_failures: _,
run_loop_iteration: _,
embedders: _,
currently_updating_index: _,
} = scheduler;
let rtxn = env.read_txn().unwrap();
@ -118,7 +116,7 @@ pub fn snapshot_bitmap(r: &RoaringBitmap) -> String {
snap
}
pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Task>>) -> String {
pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<OwnedType<BEU32>, SerdeJson<Task>>) -> String {
let mut snap = String::new();
let iter = db.iter(rtxn).unwrap();
for next in iter {
@ -128,7 +126,10 @@ pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Task>>) ->
snap
}
pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, CboRoaringBitmapCodec>) -> String {
pub fn snapshot_date_db(
rtxn: &RoTxn,
db: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
) -> String {
let mut snap = String::new();
let iter = db.iter(rtxn).unwrap();
for next in iter {
@ -248,7 +249,10 @@ pub fn snapshot_index_tasks(rtxn: &RoTxn, db: Database<Str, RoaringBitmapCodec>)
}
snap
}
pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database<BEU32, RoaringBitmapCodec>) -> String {
pub fn snapshot_canceled_by(
rtxn: &RoTxn,
db: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
) -> String {
let mut snap = String::new();
let iter = db.iter(rtxn).unwrap();
for next in iter {

View File

@ -50,12 +50,10 @@ use flate2::bufread::GzEncoder;
use flate2::Compression;
use meilisearch_types::error::ResponseError;
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
use meilisearch_types::heed::byteorder::BE;
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn};
use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
use meilisearch_types::task_view::TaskView;
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
@ -70,7 +68,8 @@ use uuid::Uuid;
use crate::index_mapper::IndexMapper;
use crate::utils::{check_index_swap_validity, clamp_to_page_size};
pub(crate) type BEI128 = I128<BE>;
pub(crate) type BEI128 =
meilisearch_types::heed::zerocopy::I128<meilisearch_types::heed::byteorder::BE>;
/// Defines a subset of tasks to be retrieved from the [`IndexScheduler`].
///
@ -245,10 +244,7 @@ pub struct IndexSchedulerOptions {
pub snapshots_path: PathBuf,
/// The path to the folder containing the dumps.
pub dumps_path: PathBuf,
/// The URL on which we must send the tasks statuses
pub webhook_url: Option<String>,
/// The value we will send into the Authorization HTTP header on the webhook URL
pub webhook_authorization_header: Option<String>,
/// The maximum size, in bytes, of the task index.
pub task_db_size: usize,
/// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index.
@ -267,9 +263,6 @@ pub struct IndexSchedulerOptions {
/// The maximum number of tasks stored in the task queue before starting
/// to auto schedule task deletions.
pub max_number_of_tasks: usize,
/// If the autobatcher is allowed to automatically batch tasks
/// it will only batch this defined number of tasks at once.
pub max_number_of_batched_tasks: usize,
/// The experimental features enabled for this instance.
pub instance_features: InstanceTogglableFeatures,
}
@ -290,7 +283,7 @@ pub struct IndexScheduler {
pub(crate) file_store: FileStore,
// The main database, it contains all the tasks accessible by their Id.
pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,
pub(crate) all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>>,
/// All the tasks ids grouped by their status.
// TODO we should not be able to serialize a `Status::Processing` in this database.
@ -301,16 +294,16 @@ pub struct IndexScheduler {
pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
/// Store the tasks that were canceled by a task uid
pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
pub(crate) canceled_by: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
/// Store the task ids of tasks which were enqueued at a specific date
pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) enqueued_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
/// Store the task ids of finished tasks which started being processed at a specific date
pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) started_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
/// Store the task ids of tasks which finished at a specific date
pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) finished_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
/// In charge of creating, opening, storing and returning indexes.
pub(crate) index_mapper: IndexMapper,
@ -328,13 +321,8 @@ pub struct IndexScheduler {
/// the finished tasks automatically.
pub(crate) max_number_of_tasks: usize,
/// The maximum number of tasks that will be batched together.
pub(crate) max_number_of_batched_tasks: usize,
/// The webhook url we should send tasks to after processing every batches.
pub(crate) webhook_url: Option<String>,
/// The Authorization header to send to the webhook URL.
pub(crate) webhook_authorization_header: Option<String>,
/// A frame to output the indexation profiling files to disk.
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
@ -351,7 +339,9 @@ pub struct IndexScheduler {
/// The path to the version file of Meilisearch.
pub(crate) version_file_path: PathBuf,
embedders: Arc<RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>>,
/// A few types of long running batches of tasks that act on a single index set this field
/// so that a handle to the index is available from other threads (search) in an optimized manner.
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
// ================= test
// The next entry is dedicated to the tests.
@ -391,15 +381,13 @@ impl IndexScheduler {
wake_up: self.wake_up.clone(),
autobatching_enabled: self.autobatching_enabled,
max_number_of_tasks: self.max_number_of_tasks,
max_number_of_batched_tasks: self.max_number_of_batched_tasks,
puffin_frame: self.puffin_frame.clone(),
snapshots_path: self.snapshots_path.clone(),
dumps_path: self.dumps_path.clone(),
auth_path: self.auth_path.clone(),
version_file_path: self.version_file_path.clone(),
webhook_url: self.webhook_url.clone(),
webhook_authorization_header: self.webhook_authorization_header.clone(),
embedders: self.embedders.clone(),
currently_updating_index: self.currently_updating_index.clone(),
#[cfg(test)]
test_breakpoint_sdr: self.test_breakpoint_sdr.clone(),
#[cfg(test)]
@ -492,14 +480,12 @@ impl IndexScheduler {
puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
autobatching_enabled: options.autobatching_enabled,
max_number_of_tasks: options.max_number_of_tasks,
max_number_of_batched_tasks: options.max_number_of_batched_tasks,
dumps_path: options.dumps_path,
snapshots_path: options.snapshots_path,
auth_path: options.auth_path,
version_file_path: options.version_file_path,
webhook_url: options.webhook_url,
webhook_authorization_header: options.webhook_authorization_header,
embedders: Default::default(),
currently_updating_index: Arc::new(RwLock::new(None)),
#[cfg(test)]
test_breakpoint_sdr,
@ -682,6 +668,13 @@ impl IndexScheduler {
/// If you need to fetch information from or perform an action on all indexes,
/// see the `try_for_each_index` function.
pub fn index(&self, name: &str) -> Result<Index> {
if let Some((current_name, current_index)) =
self.currently_updating_index.read().unwrap().as_ref()
{
if current_name == name {
return Ok(current_index.clone());
}
}
let rtxn = self.env.read_txn()?;
self.index_mapper.index(&rtxn, name)
}
@ -747,7 +740,9 @@ impl IndexScheduler {
if let Some(canceled_by) = &query.canceled_by {
let mut all_canceled_tasks = RoaringBitmap::new();
for cancel_task_uid in canceled_by {
if let Some(canceled_by_uid) = self.canceled_by.get(rtxn, cancel_task_uid)? {
if let Some(canceled_by_uid) =
self.canceled_by.get(rtxn, &BEU32::new(*cancel_task_uid))?
{
all_canceled_tasks |= canceled_by_uid;
}
}
@ -998,7 +993,7 @@ impl IndexScheduler {
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
&& (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50
&& (self.env.non_free_pages_size()? * 100) / self.env.map_size()? as u64 > 50
{
return Err(Error::NoSpaceLeftInTaskQueue);
}
@ -1024,7 +1019,7 @@ impl IndexScheduler {
// Get rid of the mutability.
let task = task;
self.all_tasks.put_with_flags(&mut wtxn, PutFlags::APPEND, &task.uid, &task)?;
self.all_tasks.append(&mut wtxn, &BEU32::new(task.uid), &task)?;
for index in task.indexes() {
self.update_index(&mut wtxn, index, |bitmap| {
@ -1162,7 +1157,7 @@ impl IndexScheduler {
};
// Reset the currently updating index to relinquish the index handle
self.index_mapper.set_currently_updating_index(None);
*self.currently_updating_index.write().unwrap() = None;
#[cfg(test)]
self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?;
@ -1202,7 +1197,7 @@ impl IndexScheduler {
| Err(Error::AbortedTask) => {
#[cfg(test)]
self.breakpoint(Breakpoint::AbortedIndexation);
wtxn.abort();
wtxn.abort().map_err(Error::HeedTransaction)?;
// We make sure that we don't call `stop_processing` on the `processing_tasks`,
// this is because we want to let the next tick call `create_next_batch` and keep
@ -1223,7 +1218,7 @@ impl IndexScheduler {
let index_uid = index_uid.unwrap();
// fixme: handle error more gracefully? not sure when this could happen
self.index_mapper.resize_index(&wtxn, &index_uid)?;
wtxn.abort();
wtxn.abort().map_err(Error::HeedTransaction)?;
return Ok(TickOutcome::TickAgain(0));
}
@ -1332,15 +1327,8 @@ impl IndexScheduler {
written: 0,
};
// let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
let request = ureq::post(url).set("Content-Encoding", "gzip");
let request = match &self.webhook_authorization_header {
Some(header) => request.set("Authorization", header),
None => request,
};
if let Err(e) = request.send(reader) {
if let Err(e) = ureq::post(url).set("Content-Encoding", "gzip").send(reader) {
log::error!("While sending data to the webhook: {e}");
}
}
@ -1422,40 +1410,6 @@ impl IndexScheduler {
}
}
// TODO: consider using a type alias or a struct embedder/template
pub fn embedders(
&self,
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
let prompt =
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
// optimistically return existing embedder
{
let embedders = self.embedders.read().unwrap();
if let Some(embedder) = embedders.get(&embedder_options) {
return Ok((name, (embedder.clone(), prompt)));
}
}
// add missing embedder
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(meilisearch_types::milli::vector::Error::from)
.map_err(meilisearch_types::milli::Error::from)?,
);
{
let mut embedders = self.embedders.write().unwrap();
embedders.insert(embedder_options, embedder.clone());
}
Ok((name, (embedder, prompt)))
})
.collect();
res.map(EmbeddingConfigs::new)
}
/// Blocks the thread until the test handle asks to progress to/through this breakpoint.
///
/// Two messages are sent through the channel for each breakpoint.
@ -1483,7 +1437,7 @@ impl IndexScheduler {
pub struct Dump<'a> {
index_scheduler: &'a IndexScheduler,
wtxn: RwTxn<'a>,
wtxn: RwTxn<'a, 'a>,
indexes: HashMap<String, RoaringBitmap>,
statuses: HashMap<Status, RoaringBitmap>,
@ -1598,7 +1552,7 @@ impl<'a> Dump<'a> {
},
};
self.index_scheduler.all_tasks.put(&mut self.wtxn, &task.uid, &task)?;
self.index_scheduler.all_tasks.put(&mut self.wtxn, &BEU32::new(task.uid), &task)?;
for index in task.indexes() {
match self.indexes.get_mut(index) {
@ -1640,8 +1594,8 @@ impl<'a> Dump<'a> {
}
}
self.statuses.entry(task.status).or_default().insert(task.uid);
self.kinds.entry(task.kind.as_kind()).or_default().insert(task.uid);
self.statuses.entry(task.status).or_insert(RoaringBitmap::new()).insert(task.uid);
self.kinds.entry(task.kind.as_kind()).or_insert(RoaringBitmap::new()).insert(task.uid);
Ok(task)
}
@ -1762,7 +1716,6 @@ mod tests {
snapshots_path: tempdir.path().join("snapshots"),
dumps_path: tempdir.path().join("dumps"),
webhook_url: None,
webhook_authorization_header: None,
task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
enable_mdb_writemap: false,
@ -1771,7 +1724,6 @@ mod tests {
indexer_config,
autobatching_enabled: true,
max_number_of_tasks: 1_000_000,
max_number_of_batched_tasks: usize::MAX,
instance_features: Default::default(),
};
configuration(&mut options);

View File

@ -3,9 +3,9 @@
use std::collections::{BTreeSet, HashSet};
use std::ops::Bound;
use meilisearch_types::heed::types::DecodeIgnore;
use meilisearch_types::heed::types::{DecodeIgnore, OwnedType};
use meilisearch_types::heed::{Database, RoTxn, RwTxn};
use meilisearch_types::milli::CboRoaringBitmapCodec;
use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status};
use roaring::{MultiOps, RoaringBitmap};
use time::OffsetDateTime;
@ -18,7 +18,7 @@ impl IndexScheduler {
}
pub(crate) fn last_task_id(&self, rtxn: &RoTxn) -> Result<Option<TaskId>> {
Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k + 1))
Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k.get() + 1))
}
pub(crate) fn next_task_id(&self, rtxn: &RoTxn) -> Result<TaskId> {
@ -26,7 +26,7 @@ impl IndexScheduler {
}
pub(crate) fn get_task(&self, rtxn: &RoTxn, task_id: TaskId) -> Result<Option<Task>> {
Ok(self.all_tasks.get(rtxn, &task_id)?)
Ok(self.all_tasks.get(rtxn, &BEU32::new(task_id))?)
}
/// Convert an iterator to a `Vec` of tasks. The tasks MUST exist or a
@ -88,7 +88,7 @@ impl IndexScheduler {
}
}
self.all_tasks.put(wtxn, &task.uid, task)?;
self.all_tasks.put(wtxn, &BEU32::new(task.uid), task)?;
Ok(())
}
@ -169,11 +169,11 @@ impl IndexScheduler {
pub(crate) fn insert_task_datetime(
wtxn: &mut RwTxn,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
time: OffsetDateTime,
task_id: TaskId,
) -> Result<()> {
let timestamp = time.unix_timestamp_nanos();
let timestamp = BEI128::new(time.unix_timestamp_nanos());
let mut task_ids = database.get(wtxn, &timestamp)?.unwrap_or_default();
task_ids.insert(task_id);
database.put(wtxn, &timestamp, &RoaringBitmap::from_iter(task_ids))?;
@ -182,11 +182,11 @@ pub(crate) fn insert_task_datetime(
pub(crate) fn remove_task_datetime(
wtxn: &mut RwTxn,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
time: OffsetDateTime,
task_id: TaskId,
) -> Result<()> {
let timestamp = time.unix_timestamp_nanos();
let timestamp = BEI128::new(time.unix_timestamp_nanos());
if let Some(mut existing) = database.get(wtxn, &timestamp)? {
existing.remove(task_id);
if existing.is_empty() {
@ -202,7 +202,7 @@ pub(crate) fn remove_task_datetime(
pub(crate) fn keep_tasks_within_datetimes(
rtxn: &RoTxn,
tasks: &mut RoaringBitmap,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
after: Option<OffsetDateTime>,
before: Option<OffsetDateTime>,
) -> Result<()> {
@ -213,8 +213,8 @@ pub(crate) fn keep_tasks_within_datetimes(
(Some(after), Some(before)) => (Bound::Excluded(*after), Bound::Excluded(*before)),
};
let mut collected_task_ids = RoaringBitmap::new();
let start = map_bound(start, |b| b.unix_timestamp_nanos());
let end = map_bound(end, |b| b.unix_timestamp_nanos());
let start = map_bound(start, |b| BEI128::new(b.unix_timestamp_nanos()));
let end = map_bound(end, |b| BEI128::new(b.unix_timestamp_nanos()));
let iter = database.range(rtxn, &(start, end))?;
for r in iter {
let (_timestamp, task_ids) = r?;
@ -337,6 +337,8 @@ impl IndexScheduler {
let rtxn = self.env.read_txn().unwrap();
for task in self.all_tasks.iter(&rtxn).unwrap() {
let (task_id, task) = task.unwrap();
let task_id = task_id.get();
let task_index_uid = task.index_uid().map(ToOwned::to_owned);
let Task {
@ -359,13 +361,16 @@ impl IndexScheduler {
.unwrap()
.contains(task.uid));
}
let db_enqueued_at =
self.enqueued_at.get(&rtxn, &enqueued_at.unix_timestamp_nanos()).unwrap().unwrap();
let db_enqueued_at = self
.enqueued_at
.get(&rtxn, &BEI128::new(enqueued_at.unix_timestamp_nanos()))
.unwrap()
.unwrap();
assert!(db_enqueued_at.contains(task_id));
if let Some(started_at) = started_at {
let db_started_at = self
.started_at
.get(&rtxn, &started_at.unix_timestamp_nanos())
.get(&rtxn, &BEI128::new(started_at.unix_timestamp_nanos()))
.unwrap()
.unwrap();
assert!(db_started_at.contains(task_id));
@ -373,7 +378,7 @@ impl IndexScheduler {
if let Some(finished_at) = finished_at {
let db_finished_at = self
.finished_at
.get(&rtxn, &finished_at.unix_timestamp_nanos())
.get(&rtxn, &BEI128::new(finished_at.unix_timestamp_nanos()))
.unwrap()
.unwrap();
assert!(db_finished_at.contains(task_id));

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::convert::TryInto;
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
use meilisearch_types::heed::{BytesDecode, BytesEncode};
use uuid::Uuid;
/// A heed codec for value of struct Uuid.
@ -10,15 +10,15 @@ pub struct UuidCodec;
impl<'a> BytesDecode<'a> for UuidCodec {
type DItem = Uuid;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
bytes.try_into().map(Uuid::from_bytes).map_err(Into::into)
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
bytes.try_into().ok().map(Uuid::from_bytes)
}
}
impl BytesEncode<'_> for UuidCodec {
type EItem = Uuid;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
Ok(Cow::Borrowed(item.as_bytes()))
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Borrowed(item.as_bytes()))
}
}

View File

@ -4,20 +4,17 @@ use std::collections::HashSet;
use std::convert::{TryFrom, TryInto};
use std::fs::create_dir_all;
use std::path::Path;
use std::result::Result as StdResult;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
use hmac::{Hmac, Mac};
use meilisearch_types::heed::BoxedError;
use meilisearch_types::index_uid_pattern::IndexUidPattern;
use meilisearch_types::keys::KeyId;
use meilisearch_types::milli;
use meilisearch_types::milli::heed::types::{Bytes, DecodeIgnore, SerdeJson};
use meilisearch_types::milli::heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
use meilisearch_types::milli::heed::{Database, Env, EnvOpenOptions, RwTxn};
use sha2::Sha256;
use thiserror::Error;
use time::OffsetDateTime;
use uuid::fmt::Hyphenated;
use uuid::Uuid;
@ -33,7 +30,7 @@ const KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME: &str = "keyid-action-index-expirat
#[derive(Clone)]
pub struct HeedAuthStore {
env: Arc<Env>,
keys: Database<Bytes, SerdeJson<Key>>,
keys: Database<ByteSlice, SerdeJson<Key>>,
action_keyid_index_expiration: Database<KeyIdActionCodec, SerdeJson<Option<OffsetDateTime>>>,
should_close_on_drop: bool,
}
@ -279,7 +276,7 @@ impl HeedAuthStore {
fn delete_key_from_inverted_db(&self, wtxn: &mut RwTxn, key: &KeyId) -> Result<()> {
let mut iter = self
.action_keyid_index_expiration
.remap_types::<Bytes, DecodeIgnore>()
.remap_types::<ByteSlice, DecodeIgnore>()
.prefix_iter_mut(wtxn, key.as_bytes())?;
while iter.next().transpose()?.is_some() {
// safety: we don't keep references from inside the LMDB database.
@ -297,24 +294,23 @@ pub struct KeyIdActionCodec;
impl<'a> milli::heed::BytesDecode<'a> for KeyIdActionCodec {
type DItem = (KeyId, Action, Option<&'a [u8]>);
fn bytes_decode(bytes: &'a [u8]) -> StdResult<Self::DItem, BoxedError> {
let (key_id_bytes, action_bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
let (&action_byte, index) =
match try_split_array_at(action_bytes).ok_or(SliceTooShortError)? {
([action], []) => (action, None),
([action], index) => (action, Some(index)),
};
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (key_id_bytes, action_bytes) = try_split_array_at(bytes)?;
let (action_bytes, index) = match try_split_array_at(action_bytes)? {
(action, []) => (action, None),
(action, index) => (action, Some(index)),
};
let key_id = Uuid::from_bytes(*key_id_bytes);
let action = Action::from_repr(action_byte).ok_or(InvalidActionError { action_byte })?;
let action = Action::from_repr(u8::from_be_bytes(*action_bytes))?;
Ok((key_id, action, index))
Some((key_id, action, index))
}
}
impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
type EItem = (&'a KeyId, &'a Action, Option<&'a [u8]>);
fn bytes_encode((key_id, action, index): &Self::EItem) -> StdResult<Cow<[u8]>, BoxedError> {
fn bytes_encode((key_id, action, index): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::new();
bytes.extend_from_slice(key_id.as_bytes());
@ -324,20 +320,10 @@ impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
bytes.extend_from_slice(index);
}
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}
#[derive(Error, Debug)]
#[error("the slice is too short")]
pub struct SliceTooShortError;
#[derive(Error, Debug)]
#[error("cannot construct a valid Action from {action_byte}")]
pub struct InvalidActionError {
pub action_byte: u8,
}
pub fn generate_key_as_hexa(uid: Uuid, master_key: &[u8]) -> String {
// format uid as hyphenated allowing user to generate their own keys.
let mut uid_buffer = [0; Hyphenated::LENGTH];

View File

@ -15,7 +15,7 @@ actix-web = { version = "4.3.1", default-features = false }
anyhow = "1.0.70"
convert_case = "0.6.0"
csv = "1.2.1"
deserr = { version = "0.6.0", features = ["actix-web"] }
deserr = { version = "0.6.0", features = ["actix-web"]}
either = { version = "1.8.1", features = ["serde"] }
enum-iterator = "1.4.0"
file-store = { path = "../file-store" }

View File

@ -188,4 +188,3 @@ merge_with_error_impl_take_error_message!(ParseOffsetDateTimeError);
merge_with_error_impl_take_error_message!(ParseTaskKindError);
merge_with_error_impl_take_error_message!(ParseTaskStatusError);
merge_with_error_impl_take_error_message!(IndexUidFormatError);
merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio);

View File

@ -222,8 +222,6 @@ InvalidVectorsType , InvalidRequest , BAD_REQUEST ;
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
InvalidEmbedder , InvalidRequest , BAD_REQUEST ;
InvalidHybridQuery , InvalidRequest , BAD_REQUEST ;
InvalidIndexLimit , InvalidRequest , BAD_REQUEST ;
InvalidIndexOffset , InvalidRequest , BAD_REQUEST ;
InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ;
@ -235,7 +233,6 @@ InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;
InvalidSearchSemanticRatio , InvalidRequest , BAD_REQUEST ;
InvalidFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
InvalidSearchFilter , InvalidRequest , BAD_REQUEST ;
InvalidSearchHighlightPostTag , InvalidRequest , BAD_REQUEST ;
@ -255,11 +252,9 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ;
InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsSortableAttributes , InvalidRequest , BAD_REQUEST ;
@ -299,18 +294,15 @@ MissingFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
MissingIndexUid , InvalidRequest , BAD_REQUEST ;
MissingMasterKey , Auth , UNAUTHORIZED ;
MissingPayload , InvalidRequest , BAD_REQUEST ;
MissingSearchHybrid , InvalidRequest , BAD_REQUEST ;
MissingSwapIndexes , InvalidRequest , BAD_REQUEST ;
MissingTaskFilters , InvalidRequest , BAD_REQUEST ;
NoSpaceLeftOnDevice , System , UNPROCESSABLE_ENTITY;
PayloadTooLarge , InvalidRequest , PAYLOAD_TOO_LARGE ;
TaskNotFound , InvalidRequest , NOT_FOUND ;
TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ;
TooManyVectors , InvalidRequest , BAD_REQUEST ;
UnretrievableDocument , Internal , BAD_REQUEST ;
UnretrievableErrorCode , InvalidRequest , BAD_REQUEST ;
UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
VectorEmbeddingError , InvalidRequest , BAD_REQUEST
UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE
}
impl ErrorCode for JoinError {
@ -343,13 +335,6 @@ impl ErrorCode for milli::Error {
UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => {
Code::InvalidDocumentId
}
UserError::MissingDocumentField(_) => Code::InvalidDocumentFields,
UserError::InvalidFieldForSource { .. }
| UserError::MissingFieldForSource { .. }
| UserError::InvalidOpenAiModel { .. }
| UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders,
UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders,
UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders,
UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound,
UserError::MultiplePrimaryKeyCandidatesFound { .. } => {
Code::IndexPrimaryKeyMultipleCandidatesFound
@ -367,15 +352,11 @@ impl ErrorCode for milli::Error {
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
UserError::InvalidVectorsType { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
UserError::SortError(_) => Code::InvalidSearchSort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidSettingsTypoTolerance
}
UserError::InvalidEmbedder(_) => Code::InvalidEmbedder,
UserError::VectorEmbeddingError(_) => Code::VectorEmbeddingError,
}
}
}
@ -405,11 +386,11 @@ impl ErrorCode for HeedError {
HeedError::Mdb(MdbError::Invalid) => Code::InvalidStoreFile,
HeedError::Io(e) => e.error_code(),
HeedError::Mdb(_)
| HeedError::Encoding(_)
| HeedError::Decoding(_)
| HeedError::Encoding
| HeedError::Decoding
| HeedError::InvalidDatabaseTyping
| HeedError::DatabaseClosing
| HeedError::BadOpenOptions { .. } => Code::Internal,
| HeedError::BadOpenOptions => Code::Internal,
}
}
}
@ -463,15 +444,6 @@ impl fmt::Display for DeserrParseIntError {
}
}
impl fmt::Display for deserr_codes::InvalidSearchSemanticRatio {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`."
)
}
}
#[macro_export]
macro_rules! internal_error {
($target:ty : $($other:path), *) => {

View File

@ -8,7 +8,6 @@ use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
use fst::IntoStreamer;
use milli::proximity::ProximityPrecision;
use milli::update::Setting;
use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
use serde::{Deserialize, Serialize, Serializer};
@ -187,9 +186,6 @@ pub struct Settings<T> {
#[deserr(default, error = DeserrJsonError<InvalidSettingsDistinctAttribute>)]
pub distinct_attribute: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsProximityPrecision>)]
pub proximity_precision: Setting<ProximityPrecisionView>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsTypoTolerance>)]
pub typo_tolerance: Setting<TypoSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
@ -199,10 +195,6 @@ pub struct Settings<T> {
#[deserr(default, error = DeserrJsonError<InvalidSettingsPagination>)]
pub pagination: Setting<PaginationSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsEmbedders>)]
pub embedders: Setting<BTreeMap<String, Setting<milli::vector::settings::EmbeddingSettings>>>,
#[serde(skip)]
#[deserr(skip)]
pub _kind: PhantomData<T>,
@ -222,11 +214,9 @@ impl Settings<Checked> {
separator_tokens: Setting::Reset,
dictionary: Setting::Reset,
distinct_attribute: Setting::Reset,
proximity_precision: Setting::Reset,
typo_tolerance: Setting::Reset,
faceting: Setting::Reset,
pagination: Setting::Reset,
embedders: Setting::Reset,
_kind: PhantomData,
}
}
@ -244,11 +234,9 @@ impl Settings<Checked> {
dictionary,
synonyms,
distinct_attribute,
proximity_precision,
typo_tolerance,
faceting,
pagination,
embedders,
..
} = self;
@ -264,11 +252,9 @@ impl Settings<Checked> {
dictionary,
synonyms,
distinct_attribute,
proximity_precision,
typo_tolerance,
faceting,
pagination,
embedders,
_kind: PhantomData,
}
}
@ -310,29 +296,12 @@ impl Settings<Unchecked> {
separator_tokens: self.separator_tokens,
dictionary: self.dictionary,
distinct_attribute: self.distinct_attribute,
proximity_precision: self.proximity_precision,
typo_tolerance: self.typo_tolerance,
faceting: self.faceting,
pagination: self.pagination,
embedders: self.embedders,
_kind: PhantomData,
}
}
pub fn validate(self) -> Result<Self, milli::Error> {
self.validate_embedding_settings()
}
fn validate_embedding_settings(mut self) -> Result<Self, milli::Error> {
let Setting::Set(mut configs) = self.embedders else { return Ok(self) };
for (name, config) in configs.iter_mut() {
let config_to_check = std::mem::take(config);
let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?;
*config = checked_config
}
self.embedders = Setting::Set(configs);
Ok(self)
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -421,12 +390,6 @@ pub fn apply_settings_to_builder(
Setting::NotSet => (),
}
match settings.proximity_precision {
Setting::Set(ref precision) => builder.set_proximity_precision((*precision).into()),
Setting::Reset => builder.reset_proximity_precision(),
Setting::NotSet => (),
}
match settings.typo_tolerance {
Setting::Set(ref value) => {
match value.enabled {
@ -513,12 +476,6 @@ pub fn apply_settings_to_builder(
Setting::Reset => builder.reset_pagination_max_total_hits(),
Setting::NotSet => (),
}
match settings.embedders.clone() {
Setting::Set(value) => builder.set_embedder_settings(value),
Setting::Reset => builder.reset_embedder_settings(),
Setting::NotSet => (),
}
}
pub fn settings(
@ -552,8 +509,6 @@ pub fn settings(
let distinct_field = index.distinct_field(rtxn)?.map(String::from);
let proximity_precision = index.proximity_precision(rtxn)?.map(ProximityPrecisionView::from);
let synonyms = index.user_defined_synonyms(rtxn)?;
let min_typo_word_len = MinWordSizeTyposSetting {
@ -577,10 +532,7 @@ pub fn settings(
let faceting = FacetingSettings {
max_values_per_facet: Setting::Set(
index
.max_values_per_facet(rtxn)?
.map(|x| x as usize)
.unwrap_or(DEFAULT_VALUES_PER_FACET),
index.max_values_per_facet(rtxn)?.unwrap_or(DEFAULT_VALUES_PER_FACET),
),
sort_facet_values_by: Setting::Set(
index
@ -593,20 +545,10 @@ pub fn settings(
let pagination = PaginationSettings {
max_total_hits: Setting::Set(
index
.pagination_max_total_hits(rtxn)?
.map(|x| x as usize)
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
index.pagination_max_total_hits(rtxn)?.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
),
};
let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)?
.into_iter()
.map(|(name, config)| (name, Setting::Set(config.into())))
.collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
Ok(Settings {
displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs),
@ -627,12 +569,10 @@ pub fn settings(
Some(field) => Setting::Set(field),
None => Setting::Reset,
},
proximity_precision: Setting::Set(proximity_precision.unwrap_or_default()),
synonyms: Setting::Set(synonyms),
typo_tolerance: Setting::Set(typo_tolerance),
faceting: Setting::Set(faceting),
pagination: Setting::Set(pagination),
embedders,
_kind: PhantomData,
})
}
@ -733,32 +673,6 @@ impl From<RankingRuleView> for Criterion {
}
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(error = DeserrJsonError<InvalidSettingsProximityPrecision>, rename_all = camelCase, deny_unknown_fields)]
pub enum ProximityPrecisionView {
#[default]
ByWord,
ByAttribute,
}
impl From<ProximityPrecision> for ProximityPrecisionView {
fn from(value: ProximityPrecision) -> Self {
match value {
ProximityPrecision::ByWord => ProximityPrecisionView::ByWord,
ProximityPrecision::ByAttribute => ProximityPrecisionView::ByAttribute,
}
}
}
impl From<ProximityPrecisionView> for ProximityPrecision {
fn from(value: ProximityPrecisionView) -> Self {
match value {
ProximityPrecisionView::ByWord => ProximityPrecision::ByWord,
ProximityPrecisionView::ByAttribute => ProximityPrecision::ByAttribute,
}
}
}
#[cfg(test)]
pub(crate) mod test {
use super::*;
@ -778,11 +692,9 @@ pub(crate) mod test {
dictionary: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
proximity_precision: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
embedders: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};
@ -804,11 +716,9 @@ pub(crate) mod test {
dictionary: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
proximity_precision: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
embedders: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};

View File

@ -39,7 +39,7 @@ byte-unit = { version = "4.0.19", default-features = false, features = [
bytes = "1.4.0"
clap = { version = "4.2.1", features = ["derive", "env"] }
crossbeam-channel = "0.5.8"
deserr = { version = "0.6.0", features = ["actix-web"] }
deserr = { version = "0.6.0", features = ["actix-web"]}
dump = { path = "../dump" }
either = "1.8.1"
env_logger = "0.10.0"
@ -154,5 +154,5 @@ greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.12/build.zip"
sha1 = "acfe9a018c93eb0604ea87ee87bff7df5474e18e"
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"
sha1 = "83cd44ed1e5f97ecb581dc9f958a63f4ccc982d9"

View File

@ -36,7 +36,7 @@ use crate::routes::{create_all_stats, Stats};
use crate::search::{
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
};
use crate::Opt;
@ -251,7 +251,6 @@ struct Infos {
env: String,
experimental_enable_metrics: bool,
experimental_reduce_indexing_memory_usage: bool,
experimental_max_number_of_batched_tasks: usize,
db_path: bool,
import_dump: bool,
dump_dir: bool,
@ -264,8 +263,6 @@ struct Infos {
ignore_snapshot_if_db_exists: bool,
http_addr: bool,
http_payload_size_limit: Byte,
task_queue_webhook: bool,
task_webhook_authorization_header: bool,
log_level: String,
max_indexing_memory: MaxMemory,
max_indexing_threads: MaxThreads,
@ -288,12 +285,10 @@ impl From<Opt> for Infos {
db_path,
experimental_enable_metrics,
experimental_reduce_indexing_memory_usage,
experimental_max_number_of_batched_tasks,
http_addr,
master_key: _,
env,
task_webhook_url,
task_webhook_authorization_header,
task_webhook_url: _,
max_index_size: _,
max_task_db_size: _,
http_payload_size_limit,
@ -346,9 +341,6 @@ impl From<Opt> for Infos {
ignore_snapshot_if_db_exists,
http_addr: http_addr != default_http_addr(),
http_payload_size_limit,
experimental_max_number_of_batched_tasks,
task_queue_webhook: task_webhook_url.is_some(),
task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
log_level: log_level.to_string(),
max_indexing_memory,
max_indexing_threads,
@ -592,11 +584,6 @@ pub struct SearchAggregator {
// vector
// The maximum number of floats in a vector request
max_vector_size: usize,
// Whether the semantic ratio passed to a hybrid search equals the default ratio.
semantic_ratio: bool,
// Whether a non-default embedder was specified
embedder: bool,
hybrid: bool,
// every time a search is done, we increment the counter linked to the used settings
matching_strategy: HashMap<String, usize>,
@ -650,7 +637,6 @@ impl SearchAggregator {
crop_marker,
matching_strategy,
attributes_to_search_on,
hybrid,
} = query;
let mut ret = Self::default();
@ -724,12 +710,6 @@ impl SearchAggregator {
ret.show_ranking_score = *show_ranking_score;
ret.show_ranking_score_details = *show_ranking_score_details;
if let Some(hybrid) = hybrid {
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
ret.embedder = hybrid.embedder.is_some();
ret.hybrid = true;
}
ret
}
@ -783,9 +763,6 @@ impl SearchAggregator {
facets_total_number_of_facets,
show_ranking_score,
show_ranking_score_details,
semantic_ratio,
embedder,
hybrid,
} = other;
if self.timestamp.is_none() {
@ -831,9 +808,6 @@ impl SearchAggregator {
// vector
self.max_vector_size = self.max_vector_size.max(max_vector_size);
self.semantic_ratio |= semantic_ratio;
self.hybrid |= hybrid;
self.embedder |= embedder;
// pagination
self.max_limit = self.max_limit.max(max_limit);
@ -902,9 +876,6 @@ impl SearchAggregator {
facets_total_number_of_facets,
show_ranking_score,
show_ranking_score_details,
semantic_ratio,
embedder,
hybrid,
} = self;
if total_received == 0 {
@ -944,11 +915,6 @@ impl SearchAggregator {
"vector": {
"max_vector_size": max_vector_size,
},
"hybrid": {
"enabled": hybrid,
"semantic_ratio": semantic_ratio,
"embedder": embedder,
},
"pagination": {
"max_limit": max_limit,
"max_offset": max_offset,
@ -1044,7 +1010,6 @@ impl MultiSearchAggregator {
crop_marker: _,
matching_strategy: _,
attributes_to_search_on: _,
hybrid: _,
} = query;
index_uid.as_str()
@ -1191,7 +1156,6 @@ impl FacetSearchAggregator {
filter,
matching_strategy,
attributes_to_search_on,
hybrid,
} = query;
let mut ret = Self::default();
@ -1205,8 +1169,7 @@ impl FacetSearchAggregator {
|| vector.is_some()
|| filter.is_some()
|| *matching_strategy != MatchingStrategy::default()
|| attributes_to_search_on.is_some()
|| hybrid.is_some();
|| attributes_to_search_on.is_some();
ret
}

View File

@ -51,8 +51,6 @@ pub enum MeilisearchHttpError {
DocumentFormat(#[from] DocumentFormatError),
#[error(transparent)]
Join(#[from] JoinError),
#[error("Invalid request: missing `hybrid` parameter when both `q` and `vector` are present.")]
MissingSearchHybrid,
}
impl ErrorCode for MeilisearchHttpError {
@ -76,7 +74,6 @@ impl ErrorCode for MeilisearchHttpError {
MeilisearchHttpError::FileStore(_) => Code::Internal,
MeilisearchHttpError::DocumentFormat(e) => e.error_code(),
MeilisearchHttpError::Join(_) => Code::Internal,
MeilisearchHttpError::MissingSearchHybrid => Code::MissingSearchHybrid,
}
}
}

View File

@ -229,14 +229,12 @@ fn open_or_create_database_unchecked(
snapshots_path: opt.snapshot_dir.clone(),
dumps_path: opt.dump_dir.clone(),
webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()),
webhook_authorization_header: opt.task_webhook_authorization_header.clone(),
task_db_size: opt.max_task_db_size.get_bytes() as usize,
index_base_map_size: opt.max_index_size.get_bytes() as usize,
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
indexer_config: (&opt.indexer_options).try_into()?,
autobatching_enabled: true,
max_number_of_tasks: 1_000_000,
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
index_count: DEFAULT_INDEX_COUNT,
instance_features,

View File

@ -19,11 +19,7 @@ static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
/// does all the setup before meilisearch is launched
fn setup(opt: &Opt) -> anyhow::Result<()> {
let mut log_builder = env_logger::Builder::new();
let log_filters = format!(
"{},h2=warn,hyper=warn,tokio_util=warn,tracing=warn,rustls=warn,mio=warn,reqwest=warn",
opt.log_level
);
log_builder.parse_filters(&log_filters);
log_builder.parse_filters(&opt.log_level.to_string());
log_builder.init();

View File

@ -30,7 +30,6 @@ const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR";
const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
const MEILI_ENV: &str = "MEILI_ENV";
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
#[cfg(feature = "analytics")]
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
@ -54,8 +53,6 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
"MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms";
@ -163,10 +160,6 @@ pub struct Opt {
#[clap(long, env = MEILI_TASK_WEBHOOK_URL)]
pub task_webhook_url: Option<Url>,
/// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified.
#[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)]
pub task_webhook_authorization_header: Option<String>,
/// Deactivates Meilisearch's built-in telemetry when provided.
///
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
@ -314,11 +307,6 @@ pub struct Opt {
#[serde(default)]
pub experimental_reduce_indexing_memory_usage: bool,
/// Experimentally reduces the maximum number of tasks that will be processed at once, see: <https://github.com/orgs/meilisearch/discussions/713>
#[clap(long, env = MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
#[serde(default = "default_limit_batched_tasks")]
pub experimental_max_number_of_batched_tasks: usize,
#[serde(flatten)]
#[clap(flatten)]
pub indexer_options: IndexerOpts,
@ -387,11 +375,9 @@ impl Opt {
master_key,
env,
task_webhook_url,
task_webhook_authorization_header,
max_index_size: _,
max_task_db_size: _,
http_payload_size_limit,
experimental_max_number_of_batched_tasks,
ssl_cert_path,
ssl_key_path,
ssl_auth_path,
@ -413,8 +399,8 @@ impl Opt {
config_file_path: _,
#[cfg(feature = "analytics")]
no_analytics,
experimental_enable_metrics,
experimental_reduce_indexing_memory_usage,
experimental_enable_metrics: enable_metrics_route,
experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
} = self;
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@ -425,12 +411,6 @@ impl Opt {
if let Some(task_webhook_url) = task_webhook_url {
export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string());
}
if let Some(task_webhook_authorization_header) = task_webhook_authorization_header {
export_to_env_if_not_present(
MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER,
task_webhook_authorization_header,
);
}
#[cfg(feature = "analytics")]
{
@ -440,10 +420,6 @@ impl Opt {
MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
http_payload_size_limit.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
experimental_max_number_of_batched_tasks.to_string(),
);
if let Some(ssl_cert_path) = ssl_cert_path {
export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path);
}
@ -468,11 +444,11 @@ impl Opt {
export_to_env_if_not_present(MEILI_LOG_LEVEL, log_level.to_string());
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_ENABLE_METRICS,
experimental_enable_metrics.to_string(),
enable_metrics_route.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
experimental_reduce_indexing_memory_usage.to_string(),
reduce_indexing_memory_usage.to_string(),
);
indexer_options.export_to_env();
}
@ -762,10 +738,6 @@ fn default_http_payload_size_limit() -> Byte {
Byte::from_str(DEFAULT_HTTP_PAYLOAD_SIZE_LIMIT).unwrap()
}
fn default_limit_batched_tasks() -> usize {
usize::MAX
}
fn default_snapshot_dir() -> PathBuf {
PathBuf::from(DEFAULT_SNAPSHOT_DIR)
}

View File

@ -3,7 +3,7 @@ use std::io::ErrorKind;
use actix_web::http::header::CONTENT_TYPE;
use actix_web::web::Data;
use actix_web::{web, HttpMessage, HttpRequest, HttpResponse};
use bstr::ByteSlice as _;
use bstr::ByteSlice;
use deserr::actix_web::{AwebJson, AwebQueryParameter};
use deserr::Deserr;
use futures::StreamExt;

View File

@ -13,9 +13,9 @@ use crate::analytics::{Analytics, FacetSearchAggregator};
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::search::{
add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery,
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
add_search_rules, perform_facet_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
pub fn configure(cfg: &mut web::ServiceConfig) {
@ -36,8 +36,6 @@ pub struct FacetSearchQuery {
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchVector>)]
pub vector: Option<Vec<f32>>,
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
pub hybrid: Option<HybridQuery>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFilter>)]
pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchMatchingStrategy>, default)]
@ -97,7 +95,6 @@ impl From<FacetSearchQuery> for SearchQuery {
filter,
matching_strategy,
attributes_to_search_on,
hybrid,
} = value;
SearchQuery {
@ -122,7 +119,6 @@ impl From<FacetSearchQuery> for SearchQuery {
matching_strategy,
vector,
attributes_to_search_on,
hybrid,
}
}
}

View File

@ -2,14 +2,12 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use deserr::actix_web::{AwebJson, AwebQueryParameter};
use index_scheduler::IndexScheduler;
use log::{debug, warn};
use log::debug;
use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli;
use meilisearch_types::milli::vector::DistributionShift;
use meilisearch_types::serde_cs::vec::CS;
use serde_json::Value;
@ -18,9 +16,9 @@ use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::search::{
add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchQuery, SemanticRatio,
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
add_search_rules, perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
pub fn configure(cfg: &mut web::ServiceConfig) {
@ -76,31 +74,6 @@ pub struct SearchQueryGet {
matching_strategy: MatchingStrategy,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToSearchOn>)]
pub attributes_to_search_on: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidEmbedder>)]
pub hybrid_embedder: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchSemanticRatio>)]
pub hybrid_semantic_ratio: Option<SemanticRatioGet>,
}
#[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)]
#[deserr(try_from(String) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
pub struct SemanticRatioGet(SemanticRatio);
impl std::convert::TryFrom<String> for SemanticRatioGet {
type Error = InvalidSearchSemanticRatio;
fn try_from(s: String) -> Result<Self, Self::Error> {
let f: f32 = s.parse().map_err(|_| InvalidSearchSemanticRatio)?;
Ok(SemanticRatioGet(SemanticRatio::try_from(f)?))
}
}
impl std::ops::Deref for SemanticRatioGet {
type Target = SemanticRatio;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl From<SearchQueryGet> for SearchQuery {
@ -113,20 +86,6 @@ impl From<SearchQueryGet> for SearchQuery {
None => None,
};
let hybrid = match (other.hybrid_embedder, other.hybrid_semantic_ratio) {
(None, None) => None,
(None, Some(semantic_ratio)) => {
Some(HybridQuery { semantic_ratio: *semantic_ratio, embedder: None })
}
(Some(embedder), None) => Some(HybridQuery {
semantic_ratio: DEFAULT_SEMANTIC_RATIO(),
embedder: Some(embedder),
}),
(Some(embedder), Some(semantic_ratio)) => {
Some(HybridQuery { semantic_ratio: *semantic_ratio, embedder: Some(embedder) })
}
};
Self {
q: other.q,
vector: other.vector.map(CS::into_inner),
@ -149,7 +108,6 @@ impl From<SearchQueryGet> for SearchQuery {
crop_marker: other.crop_marker,
matching_strategy: other.matching_strategy,
attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()),
hybrid,
}
}
}
@ -200,12 +158,8 @@ pub async fn search_with_url_query(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let distribution = embed(&mut query, index_scheduler.get_ref(), &index).await?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution))
.await?;
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result);
}
@ -239,12 +193,8 @@ pub async fn search_with_post(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let distribution = embed(&mut query, index_scheduler.get_ref(), &index).await?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution))
.await?;
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result);
}
@ -256,80 +206,6 @@ pub async fn search_with_post(
Ok(HttpResponse::Ok().json(search_result))
}
pub async fn embed(
query: &mut SearchQuery,
index_scheduler: &IndexScheduler,
index: &milli::Index,
) -> Result<Option<DistributionShift>, ResponseError> {
match (&query.hybrid, &query.vector, &query.q) {
(Some(HybridQuery { semantic_ratio: _, embedder }), None, Some(q))
if !q.trim().is_empty() =>
{
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
let embedder = if let Some(embedder_name) = embedder {
embedders.get(embedder_name)
} else {
embedders.get_default()
};
let embedder = embedder
.ok_or(milli::UserError::InvalidEmbedder("default".to_owned()))
.map_err(milli::Error::from)?
.0;
let distribution = embedder.distribution();
let embeddings = embedder
.embed(vec![q.to_owned()])
.await
.map_err(milli::vector::Error::from)
.map_err(milli::Error::from)?
.pop()
.expect("No vector returned from embedding");
if embeddings.iter().nth(1).is_some() {
warn!("Ignoring embeddings past the first one in long search query");
query.vector = Some(embeddings.iter().next().unwrap().to_vec());
} else {
query.vector = Some(embeddings.into_inner());
}
Ok(distribution)
}
(Some(hybrid), vector, _) => {
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
let embedder = if let Some(embedder_name) = &hybrid.embedder {
embedders.get(embedder_name)
} else {
embedders.get_default()
};
let embedder = embedder
.ok_or(milli::UserError::InvalidEmbedder("default".to_owned()))
.map_err(milli::Error::from)?
.0;
if let Some(vector) = vector {
if vector.len() != embedder.dimensions() {
return Err(meilisearch_types::milli::Error::UserError(
meilisearch_types::milli::UserError::InvalidVectorDimensions {
expected: embedder.dimensions(),
found: vector.len(),
},
)
.into());
}
}
Ok(embedder.distribution())
}
_ => Ok(None),
}
}
#[cfg(test)]
mod test {
use super::*;

View File

@ -7,7 +7,6 @@ use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::ResponseError;
use meilisearch_types::facet_values_sort::FacetValuesSort;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::update::Setting;
use meilisearch_types::settings::{settings, RankingRuleView, Settings, Unchecked};
use meilisearch_types::tasks::KindWithContent;
use serde_json::json;
@ -79,7 +78,6 @@ macro_rules! make_setting_route {
let body = body.into_inner();
#[allow(clippy::redundant_closure_call)]
$analytics(&body, &req);
let new_settings = Settings {
@ -90,11 +88,6 @@ macro_rules! make_setting_route {
..Default::default()
};
let new_settings = $crate::routes::indexes::settings::validate_settings(
new_settings,
&index_scheduler,
)?;
let allow_index_creation =
index_scheduler.filters().allow_index_creation(&index_uid);
@ -441,31 +434,6 @@ make_setting_route!(
}
);
make_setting_route!(
"/proximity-precision",
put,
meilisearch_types::settings::ProximityPrecisionView,
meilisearch_types::deserr::DeserrJsonError<
meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision,
>,
proximity_precision,
"proximityPrecision",
analytics,
|precision: &Option<meilisearch_types::settings::ProximityPrecisionView>, req: &HttpRequest| {
use serde_json::json;
analytics.publish(
"ProximityPrecision Updated".to_string(),
json!({
"proximity_precision": {
"set": precision.is_some(),
"value": precision.unwrap_or_default(),
}
}),
Some(req),
);
}
);
make_setting_route!(
"/ranking-rules",
put,
@ -552,67 +520,6 @@ make_setting_route!(
}
);
make_setting_route!(
"/embedders",
patch,
std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>,
meilisearch_types::deserr::DeserrJsonError<
meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders,
>,
embedders,
"embedders",
analytics,
|setting: &Option<std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>>, req: &HttpRequest| {
analytics.publish(
"Embedders Updated".to_string(),
serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}),
Some(req),
);
}
);
fn embedder_analytics(
setting: Option<
&std::collections::BTreeMap<
String,
Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>,
>,
>,
) -> serde_json::Value {
let mut sources = std::collections::HashSet::new();
if let Some(s) = &setting {
for source in s
.values()
.filter_map(|config| config.clone().set())
.filter_map(|config| config.source.set())
{
use meilisearch_types::milli::vector::settings::EmbedderSource;
match source {
EmbedderSource::OpenAi => sources.insert("openAi"),
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
EmbedderSource::UserProvided => sources.insert("userProvided"),
};
}
};
let document_template_used = setting.as_ref().map(|map| {
map.values()
.filter_map(|config| config.clone().set())
.any(|config| config.document_template.set().is_some())
});
json!(
{
"total": setting.as_ref().map(|s| s.len()),
"sources": sources,
"document_template_used": document_template_used,
}
)
}
macro_rules! generate_configure {
($($mod:ident),*) => {
pub fn configure(cfg: &mut web::ServiceConfig) {
@ -633,7 +540,6 @@ generate_configure!(
displayed_attributes,
searchable_attributes,
distinct_attribute,
proximity_precision,
stop_words,
separator_tokens,
non_separator_tokens,
@ -642,8 +548,7 @@ generate_configure!(
ranking_rules,
typo_tolerance,
pagination,
faceting,
embedders
faceting
);
pub async fn update_all(
@ -656,7 +561,6 @@ pub async fn update_all(
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let new_settings = body.into_inner();
let new_settings = validate_settings(new_settings, &index_scheduler)?;
analytics.publish(
"Settings Updated".to_string(),
@ -689,10 +593,6 @@ pub async fn update_all(
"distinct_attribute": {
"set": new_settings.distinct_attribute.as_ref().set().is_some()
},
"proximity_precision": {
"set": new_settings.proximity_precision.as_ref().set().is_some(),
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
},
"typo_tolerance": {
"enabled": new_settings.typo_tolerance
.as_ref()
@ -752,7 +652,6 @@ pub async fn update_all(
"synonyms": {
"total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()),
},
"embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set())
}),
Some(&req),
);
@ -807,13 +706,3 @@ pub async fn delete_all(
debug!("returns: {:?}", task);
Ok(HttpResponse::Accepted().json(task))
}
fn validate_settings(
settings: Settings<Unchecked>,
index_scheduler: &IndexScheduler,
) -> Result<Settings<Unchecked>, ResponseError> {
if matches!(settings.embedders, Setting::Set(_)) {
index_scheduler.features().check_vector("Passing `embedders` in settings")?
}
Ok(settings.validate()?)
}

View File

@ -13,7 +13,6 @@ use crate::analytics::{Analytics, MultiSearchAggregator};
use crate::extractors::authentication::policies::ActionPolicy;
use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::search::embed;
use crate::search::{
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
};
@ -47,51 +46,49 @@ pub async fn multi_search_with_post(
// Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
// so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code
// changes.
let search_results: Result<_, (ResponseError, usize)> = async {
let mut search_results = Vec::with_capacity(queries.len());
for (query_index, (index_uid, mut query)) in
queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate()
{
debug!("multi-search #{query_index}: called with params: {:?}", query);
// Check index from API key
if !index_scheduler.filters().is_index_authorized(&index_uid) {
return Err(AuthenticationError::InvalidToken).with_index(query_index);
}
// Apply search rules from tenant token
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid)
let search_results: Result<_, (ResponseError, usize)> = (|| {
async {
let mut search_results = Vec::with_capacity(queries.len());
for (query_index, (index_uid, mut query)) in
queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate()
{
add_search_rules(&mut query, search_rules);
debug!("multi-search #{query_index}: called with params: {:?}", query);
// Check index from API key
if !index_scheduler.filters().is_index_authorized(&index_uid) {
return Err(AuthenticationError::InvalidToken).with_index(query_index);
}
// Apply search rules from tenant token
if let Some(search_rules) =
index_scheduler.filters().get_index_search_rules(&index_uid)
{
add_search_rules(&mut query, search_rules);
}
let index = index_scheduler
.index(&index_uid)
.map_err(|err| {
let mut err = ResponseError::from(err);
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
// here the resource not found is not part of the URL.
err.code = StatusCode::BAD_REQUEST;
err
})
.with_index(query_index)?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features))
.await
.with_index(query_index)?;
search_results.push(SearchResultWithIndex {
index_uid: index_uid.into_inner(),
result: search_result.with_index(query_index)?,
});
}
let index = index_scheduler
.index(&index_uid)
.map_err(|err| {
let mut err = ResponseError::from(err);
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
// here the resource not found is not part of the URL.
err.code = StatusCode::BAD_REQUEST;
err
})
.with_index(query_index)?;
let distribution = embed(&mut query, index_scheduler.get_ref(), &index)
.await
.with_index(query_index)?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(&index, query, features, distribution)
})
.await
.with_index(query_index)?;
search_results.push(SearchResultWithIndex {
index_uid: index_uid.into_inner(),
result: search_result.with_index(query_index)?,
});
Ok(search_results)
}
Ok(search_results)
}
})()
.await;
if search_results.is_ok() {

View File

@ -7,21 +7,24 @@ use deserr::Deserr;
use either::Either;
use index_scheduler::RoFeatures;
use indexmap::IndexMap;
use log::warn;
use meilisearch_auth::IndexSearchRules;
use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{self, ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::vector::DistributionShift;
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues};
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::{
dot_product_similarity, FacetValueHit, InternalError, OrderBy, SearchForFacetValues,
};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use meilisearch_types::{milli, Document};
use milli::tokenizer::TokenizerBuilder;
use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
SortError, TermsMatchingStrategy, VectorOrArrayOfVectors, DEFAULT_VALUES_PER_FACET,
};
use ordered_float::OrderedFloat;
use regex::Regex;
use serde::Serialize;
use serde_json::{json, Value};
@ -36,7 +39,6 @@ pub const DEFAULT_CROP_LENGTH: fn() -> usize = || 10;
pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5);
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
@ -45,8 +47,6 @@ pub struct SearchQuery {
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchVector>)]
pub vector: Option<Vec<f32>>,
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
pub hybrid: Option<HybridQuery>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@ -87,48 +87,6 @@ pub struct SearchQuery {
pub attributes_to_search_on: Option<Vec<String>>,
}
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)]
pub struct HybridQuery {
/// TODO validate that sementic ratio is between 0.0 and 1,0
#[deserr(default, error = DeserrJsonError<InvalidSearchSemanticRatio>, default)]
pub semantic_ratio: SemanticRatio,
#[deserr(default, error = DeserrJsonError<InvalidEmbedder>, default)]
pub embedder: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Deserr)]
#[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
pub struct SemanticRatio(f32);
impl Default for SemanticRatio {
fn default() -> Self {
DEFAULT_SEMANTIC_RATIO()
}
}
impl std::convert::TryFrom<f32> for SemanticRatio {
type Error = InvalidSearchSemanticRatio;
fn try_from(f: f32) -> Result<Self, Self::Error> {
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
#[allow(clippy::manual_range_contains)]
if f > 1.0 || f < 0.0 {
Err(InvalidSearchSemanticRatio)
} else {
Ok(SemanticRatio(f))
}
}
}
impl std::ops::Deref for SemanticRatio {
type Target = f32;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl SearchQuery {
pub fn is_finite_pagination(&self) -> bool {
self.page.or(self.hits_per_page).is_some()
@ -148,8 +106,6 @@ pub struct SearchQueryWithIndex {
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
pub hybrid: Option<HybridQuery>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@ -215,7 +171,6 @@ impl SearchQueryWithIndex {
crop_marker,
matching_strategy,
attributes_to_search_on,
hybrid,
} = self;
(
index_uid,
@ -241,7 +196,6 @@ impl SearchQueryWithIndex {
crop_marker,
matching_strategy,
attributes_to_search_on,
hybrid,
// do not use ..Default::default() here,
// rather add any missing field from `SearchQuery` to `SearchQueryWithIndex`
},
@ -381,44 +335,19 @@ fn prepare_search<'t>(
rtxn: &'t RoTxn,
query: &'t SearchQuery,
features: RoFeatures,
distribution: Option<DistributionShift>,
) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> {
let mut search = index.search(rtxn);
if query.vector.is_some() {
features.check_vector("Passing `vector` as a query parameter")?;
if query.vector.is_some() && query.q.is_some() {
warn!("Ignoring the query string `q` when used with the `vector` parameter.");
}
if query.hybrid.is_some() {
features.check_vector("Passing `hybrid` as a query parameter")?;
}
if query.hybrid.is_none() && query.q.is_some() && query.vector.is_some() {
return Err(MeilisearchHttpError::MissingSearchHybrid);
}
search.distribution_shift(distribution);
if let Some(ref vector) = query.vector {
match &query.hybrid {
// If semantic ratio is 0.0, only the query search will impact the search results,
// skip the vector
Some(hybrid) if *hybrid.semantic_ratio == 0.0 => (),
_otherwise => {
search.vector(vector.clone());
}
}
search.vector(vector.clone());
}
if let Some(ref q) = query.q {
match &query.hybrid {
// If semantic ratio is 1.0, only the vector search will impact the search results,
// skip the query
Some(hybrid) if *hybrid.semantic_ratio == 1.0 => (),
_otherwise => {
search.query(q);
}
}
if let Some(ref query) = query.q {
search.query(query);
}
if let Some(ref searchable) = query.attributes_to_search_on {
@ -431,7 +360,6 @@ fn prepare_search<'t>(
let max_total_hits = index
.pagination_max_total_hits(rtxn)
.map_err(milli::Error::from)?
.map(|x| x as usize)
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
search.exhaustive_number_hits(is_finite_pagination);
@ -445,8 +373,8 @@ fn prepare_search<'t>(
features.check_score_details()?;
}
if let Some(HybridQuery { embedder: Some(embedder), .. }) = &query.hybrid {
search.embedder_name(embedder);
if query.vector.is_some() {
features.check_vector()?;
}
// compute the offset on the limit depending on the pagination mode.
@ -492,22 +420,15 @@ pub fn perform_search(
index: &Index,
query: SearchQuery,
features: RoFeatures,
distribution: Option<DistributionShift>,
) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
let (search, is_finite_pagination, max_total_hits, offset) =
prepare_search(index, &rtxn, &query, features, distribution)?;
prepare_search(index, &rtxn, &query, features)?;
let milli::SearchResult { documents_ids, matching_words, candidates, document_scores, .. } =
match &query.hybrid {
Some(hybrid) => match *hybrid.semantic_ratio {
ratio if ratio == 0.0 || ratio == 1.0 => search.execute()?,
ratio => search.execute_hybrid(ratio)?,
},
None => search.execute()?,
};
search.execute()?;
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
@ -616,17 +537,13 @@ pub fn perform_search(
insert_geo_distance(sort, &mut document);
}
let mut semantic_score = None;
for details in &score {
if let ScoreDetails::Vector(score_details::Vector {
target_vector: _,
value_similarity: Some((_matching_vector, similarity)),
}) = details
{
semantic_score = Some(*similarity);
break;
}
}
let semantic_score = match query.vector.as_ref() {
Some(vector) => match extract_field("_vectors", &fields_ids_map, obkv)? {
Some(vectors) => compute_semantic_score(vector, vectors)?,
None => None,
},
None => None,
};
let ranking_score =
query.show_ranking_score.then(|| ScoreDetails::global_score(score.iter()));
@ -669,7 +586,6 @@ pub fn perform_search(
let max_values_by_facet = index
.max_values_per_facet(&rtxn)
.map_err(milli::Error::from)?
.map(|x| x as usize)
.unwrap_or(DEFAULT_VALUES_PER_FACET);
facet_distribution.max_values_per_facet(max_values_by_facet);
@ -729,15 +645,11 @@ pub fn perform_facet_search(
let before_search = Instant::now();
let rtxn = index.read_txn()?;
let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?;
let mut facet_search =
SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some());
let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features)?;
let mut facet_search = SearchForFacetValues::new(facet_name, search);
if let Some(facet_query) = &facet_query {
facet_search.query(facet_query);
}
if let Some(max_facets) = index.max_values_per_facet(&rtxn)? {
facet_search.max_values(max_facets as usize);
}
Ok(FacetSearchResult {
facet_hits: facet_search.execute()?,
@ -762,6 +674,18 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) {
}
}
fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option<f32>> {
let vectors = serde_json::from_value(vectors)
.map(VectorOrArrayOfVectors::into_array_of_vectors)
.map_err(InternalError::SerdeJson)?;
Ok(vectors
.into_iter()
.flatten()
.map(|v| OrderedFloat(dot_product_similarity(query, &v)))
.max()
.map(OrderedFloat::into_inner))
}
fn compute_formatted_options(
attr_to_highlight: &HashSet<String>,
attr_to_crop: &[String],
@ -889,6 +813,22 @@ fn make_document(
Ok(document)
}
/// Extract the JSON value under the field name specified
/// but doesn't support nested objects.
fn extract_field(
field_name: &str,
field_ids_map: &FieldsIdsMap,
obkv: obkv::KvReaderU16,
) -> Result<Option<serde_json::Value>, MeilisearchHttpError> {
match field_ids_map.id(field_name) {
Some(fid) => match obkv.get(fid) {
Some(value) => Ok(serde_json::from_slice(value).map(Some)?),
None => Ok(None),
},
None => Ok(None),
}
}
fn format_fields<'a>(
document: &Document,
field_ids_map: &FieldsIdsMap,
@ -900,14 +840,6 @@ fn format_fields<'a>(
let mut matches_position = compute_matches.then(BTreeMap::new);
let mut document = document.clone();
// reduce the formatted option list to the attributes that should be formatted,
// instead of all the attributes to display.
let formatting_fields_options: Vec<_> = formatted_options
.iter()
.filter(|(_, option)| option.should_format())
.map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
.collect();
// select the attributes to retrieve
let displayable_names =
displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
@ -916,15 +848,13 @@ fn format_fields<'a>(
// to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
// and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
// highlighted.
// Warn: The time to compute the format list scales with the number of fields to format;
// cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
// d*f where d is the total number of fields to display and f is the total number of fields to format.
let format = formatting_fields_options
let format = formatted_options
.iter()
.filter(|(name, _option)| {
.filter(|(field, _option)| {
let name = field_ids_map.name(**field).unwrap();
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
})
.map(|(_, option)| **option)
.map(|(_, option)| *option)
.reduce(|acc, option| acc.merge(option));
let mut infos = Vec::new();
@ -1021,7 +951,7 @@ fn format_value<'a>(
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(s),
None => Value::Number(number),
}
}
value => value,

View File

@ -20,8 +20,6 @@ pub enum GetDump {
RubyGemsWithSettingsV4,
TestV5,
TestV6WithExperimental,
}
impl GetDump {
@ -70,10 +68,6 @@ impl GetDump {
GetDump::TestV5 => {
exist_relative_path!("tests/assets/v5_v0.28.0_test_dump.dump").into()
}
GetDump::TestV6WithExperimental => exist_relative_path!(
"tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump"
)
.into(),
}
}
}

View File

@ -59,7 +59,6 @@ async fn import_dump_v1_movie_raw() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -220,7 +219,6 @@ async fn import_dump_v1_movie_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -367,7 +365,6 @@ async fn import_dump_v1_rubygems_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -500,7 +497,6 @@ async fn import_dump_v2_movie_raw() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -645,7 +641,6 @@ async fn import_dump_v2_movie_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -789,7 +784,6 @@ async fn import_dump_v2_rubygems_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -922,7 +916,6 @@ async fn import_dump_v3_movie_raw() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1067,7 +1060,6 @@ async fn import_dump_v3_movie_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1211,7 +1203,6 @@ async fn import_dump_v3_rubygems_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1344,7 +1335,6 @@ async fn import_dump_v4_movie_raw() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1489,7 +1479,6 @@ async fn import_dump_v4_movie_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1633,7 +1622,6 @@ async fn import_dump_v4_rubygems_with_settings() {
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
@ -1822,107 +1810,3 @@ async fn import_dump_v5() {
json_string!(tasks, { ".results[].details.dumpUid" => "[uid]", ".results[].duration" => "[duration]" , ".results[].startedAt" => "[date]" , ".results[].finishedAt" => "[date]" })
);
}
#[actix_rt::test]
async fn import_dump_v6_containing_experimental_features() {
let temp = tempfile::tempdir().unwrap();
let options = Opt {
import_dump: Some(GetDump::TestV6WithExperimental.path()),
..default_settings(temp.path())
};
let mut server = Server::new_auth_with_options(options, temp).await;
server.use_api_key("MASTER_KEY");
let (indexes, code) = server.list_indexes(None, None).await;
assert_eq!(code, 200, "{indexes}");
assert_eq!(indexes["results"].as_array().unwrap().len(), 1);
assert_eq!(indexes["results"][0]["uid"], json!("movies"));
assert_eq!(indexes["results"][0]["primaryKey"], json!("id"));
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": false,
"metrics": false,
"exportPuffinReports": false
}
"###);
let index = server.index("movies");
let (response, code) = index.settings().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"words",
"typo",
"proximity"
],
"stopWords": [],
"nonSeparatorTokens": [],
"separatorTokens": [],
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byAttribute",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
"oneTypo": 5,
"twoTypos": 9
},
"disableOnWords": [],
"disableOnAttributes": []
},
"faceting": {
"maxValuesPerFacet": 100,
"sortFacetValuesBy": {
"*": "alpha"
}
},
"pagination": {
"maxTotalHits": 1000
}
}
"###);
// the expected order is [1, 3, 2] instead of [3, 1, 2]
// because the attribute scale doesn't make the difference between 1 and 3.
index
.search(json!({"q": "the soup of day"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
}

View File

@ -4,7 +4,7 @@ use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,
@ -107,8 +107,8 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
])
});
static DOCUMENT_PRIMARY_KEY: &str = "id";
static DOCUMENT_DISTINCT_KEY: &str = "product_id";
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id";
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test]

View File

@ -4,7 +4,7 @@ use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
@ -105,24 +105,6 @@ async fn more_advanced_facet_search() {
snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
}
#[actix_rt::test]
async fn simple_facet_search_with_max_values() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await;
index.update_settings_filterable_attributes(json!(["genres"])).await;
index.add_documents(documents, None).await;
index.wait_task(2).await;
let (response, code) =
index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;
assert_eq!(code, 200, "{}", response);
assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1);
}
#[actix_rt::test]
async fn non_filterable_facet_search_error() {
let server = Server::new().await;

View File

@ -4,7 +4,7 @@ use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,

View File

@ -1,175 +0,0 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use crate::common::index::Index;
use crate::common::{Server, Value};
use crate::json;
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
let index = server.index("test");
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
}
"###);
let (response, code) = index
.update_settings(json!({ "embedders": {"default": {
"source": "userProvided",
"dimensions": 2}}} ))
.await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, code) = index.add_documents(documents.clone(), None).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
index
}
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {"default": [1.0, 3.0]},
},
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
"_vectors": {"default": [1.0, 2.0]},
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
"_vectors": {"default": [2.0, 3.0]},
}])
});
static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
json!([{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {"default": [1.0, 3.0]},
}])
});
#[actix_rt::test]
async fn simple_search() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###);
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###);
}
#[actix_rt::test]
async fn invalid_semantic_ratio() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.2}}),
)
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value at `.hybrid.semanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_search_semantic_ratio",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
}
"###);
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": -0.8}}),
)
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value at `.hybrid.semanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_search_semantic_ratio",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
}
"###);
let (response, code) = index
.search_get(
&yaup::to_string(
&json!({"q": "Captain", "vector": [1.0, 1.0], "hybridSemanticRatio": 1.2}),
)
.unwrap(),
)
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value in parameter `hybridSemanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_search_semantic_ratio",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
}
"###);
let (response, code) = index
.search_get(
&yaup::to_string(
&json!({"q": "Captain", "vector": [1.0, 1.0], "hybridSemanticRatio": -0.2}),
)
.unwrap(),
)
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value in parameter `hybridSemanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
"code": "invalid_search_semantic_ratio",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
}
"###);
}
#[actix_rt::test]
async fn single_document() {
let server = Server::new().await;
let index = index_with_documents(&server, &SINGLE_DOCUMENT).await;
let (response, code) = index
.search_post(
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###);
}

View File

@ -6,7 +6,6 @@ mod errors;
mod facet_search;
mod formatted;
mod geo;
mod hybrid;
mod multi;
mod pagination;
mod restrict_searchable;
@ -16,37 +15,32 @@ use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
"id": "287947",
"_vectors": { "manual": [1, 2, 3]},
},
{
"title": "Captain Marvel",
"id": "299537",
"_vectors": { "manual": [1, 2, 54] },
},
{
"title": "Escape Room",
"id": "522681",
"_vectors": { "manual": [10, -23, 32] },
},
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": { "manual": [-100, 231, 32] },
},
{
"title": "Gläss",
"id": "450465",
"_vectors": { "manual": [-100, 340, 90] },
}
])
});
static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 852,
@ -63,7 +57,6 @@ static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
},
],
"cattos": "pésti",
"_vectors": { "manual": [1, 2, 3]},
},
{
"id": 654,
@ -76,14 +69,12 @@ static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
},
],
"cattos": ["simba", "pestiféré"],
"_vectors": { "manual": [1, 2, 54] },
},
{
"id": 750,
"father": "romain",
"mother": "michelle",
"cattos": ["enigma"],
"_vectors": { "manual": [10, 23, 32] },
},
{
"id": 951,
@ -100,7 +91,6 @@ static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
},
],
"cattos": ["moumoute", "gomez"],
"_vectors": { "manual": [10, 23, 32] },
},
])
});
@ -812,13 +802,6 @@ async fn experimental_feature_score_details() {
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
]
},
"_rankingScoreDetails": {
"words": {
"order": 0,
@ -840,7 +823,7 @@ async fn experimental_feature_score_details() {
"order": 3,
"attributeRankingOrderScore": 1.0,
"queryWordDistanceScore": 0.8095238095238095,
"score": 0.9727891156462584
"score": 0.9365079365079364
},
"exactness": {
"order": 4,
@ -887,100 +870,13 @@ async fn experimental_feature_vector_store() {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response["vectorStore"], @"true");
let (response, code) = index
.update_settings(json!({"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
}}))
.await;
meili_snap::snapshot!(response, @r###"
{
"taskUid": 1,
"indexUid": "test",
"status": "enqueued",
"type": "settingsUpdate",
"enqueuedAt": "[date]"
}
"###);
meili_snap::snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
meili_snap::snapshot!(meili_snap::json_string!(response["status"]), @"\"succeeded\"");
let (response, code) = index
.search_post(json!({
"vector": [1.0, 2.0, 3.0],
}))
.await;
meili_snap::snapshot!(code, @"200 OK");
// vector search returns all documents that don't have vectors in the last bucket, like all sorts
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"title": "Shazam!",
"id": "287947",
"_vectors": {
"manual": [
1,
2,
3
]
},
"_semanticScore": 1.0
},
{
"title": "Captain Marvel",
"id": "299537",
"_vectors": {
"manual": [
1,
2,
54
]
},
"_semanticScore": 0.9129112
},
{
"title": "Gläss",
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
]
},
"_semanticScore": 0.8106413
},
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
]
},
"_semanticScore": 0.74120104
},
{
"title": "Escape Room",
"id": "522681",
"_vectors": {
"manual": [
10,
-23,
32
]
}
}
]
"###);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @"[]");
}
#[cfg(feature = "default")]
@ -1230,14 +1126,7 @@ async fn simple_search_with_strange_synonyms() {
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
]
}
"id": "166428"
}
]
"###);
@ -1251,14 +1140,7 @@ async fn simple_search_with_strange_synonyms() {
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
]
}
"id": "166428"
}
]
"###);
@ -1272,14 +1154,7 @@ async fn simple_search_with_strange_synonyms() {
[
{
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
]
}
"id": "166428"
}
]
"###);

View File

@ -72,14 +72,7 @@ async fn simple_search_single_index() {
"hits": [
{
"title": "Gläss",
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
]
}
"id": "450465"
}
],
"query": "glass",
@ -93,14 +86,7 @@ async fn simple_search_single_index() {
"hits": [
{
"title": "Captain Marvel",
"id": "299537",
"_vectors": {
"manual": [
1,
2,
54
]
}
"id": "299537"
}
],
"query": "captain",
@ -191,14 +177,7 @@ async fn simple_search_two_indexes() {
"hits": [
{
"title": "Gläss",
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
]
}
"id": "450465"
}
],
"query": "glass",
@ -224,14 +203,7 @@ async fn simple_search_two_indexes() {
"age": 4
}
],
"cattos": "pésti",
"_vectors": {
"manual": [
1,
2,
3
]
}
"cattos": "pésti"
},
{
"id": 654,
@ -246,14 +218,7 @@ async fn simple_search_two_indexes() {
"cattos": [
"simba",
"pestiféré"
],
"_vectors": {
"manual": [
1,
2,
54
]
}
]
}
],
"query": "pésti",

View File

@ -335,35 +335,3 @@ async fn exactness_ranking_rule_order() {
})
.await;
}
#[actix_rt::test]
async fn search_on_exact_field() {
let server = Server::new().await;
let index = index_with_documents(
&server,
&json!([
{
"title": "Captain Marvel",
"exact": "Captain Marivel",
"id": "1",
},
{
"title": "Captain Marivel",
"exact": "Captain the Marvel",
"id": "2",
}]),
)
.await;
let (response, code) =
index.update_settings_typo_tolerance(json!({ "disableOnAttributes": ["exact"] })).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(1).await;
// Searching on an exact attribute should only return the document matching without typo.
index
.search(json!({"q": "Marvel", "attributesToSearchOn": ["exact"]}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(response["hits"].as_array().unwrap().len(), @"1");
})
.await;
}

View File

@ -54,7 +54,7 @@ async fn get_settings() {
let (response, code) = index.settings().await;
assert_eq!(code, 200);
let settings = response.as_object().unwrap();
assert_eq!(settings.keys().len(), 15);
assert_eq!(settings.keys().len(), 14);
assert_eq!(settings["displayedAttributes"], json!(["*"]));
assert_eq!(settings["searchableAttributes"], json!(["*"]));
assert_eq!(settings["filterableAttributes"], json!([]));
@ -83,7 +83,6 @@ async fn get_settings() {
"maxTotalHits": 1000,
})
);
assert_eq!(settings["proximityPrecision"], json!("byWord"));
}
#[actix_rt::test]

View File

@ -1,5 +1,4 @@
mod distinct;
mod errors;
mod get_settings;
mod proximity_settings;
mod tokenizer_customization;

View File

@ -1,352 +0,0 @@
use meili_snap::{json_string, snapshot};
use once_cell::sync::Lazy;
use crate::common::Server;
use crate::json;
static DOCUMENTS: Lazy<crate::common::Value> = Lazy::new(|| {
json!([
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish",
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish",
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish",
},
])
});
#[actix_rt::test]
async fn attribute_scale_search() {
let server = Server::new().await;
let index = server.index("test");
index.add_documents(DOCUMENTS.clone(), None).await;
index.wait_task(0).await;
let (response, code) = index
.update_settings(json!({
"proximityPrecision": "byAttribute",
"rankingRules": ["words", "typo", "proximity"],
}))
.await;
assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(1).await;
// the expected order is [1, 3, 2] instead of [3, 1, 2]
// because the attribute scale doesn't make the difference between 1 and 3.
index
.search(json!({"q": "the soup of day"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
// the expected order is [1, 2, 3] instead of [1, 3, 2]
// because the attribute scale sees all the word in the same attribute
// and so doesn't make the difference between the documents.
index
.search(json!({"q": "many the fish"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
}
]
"###);
})
.await;
}
#[actix_rt::test]
async fn attribute_scale_phrase_search() {
let server = Server::new().await;
let index = server.index("test");
index.add_documents(DOCUMENTS.clone(), None).await;
index.wait_task(0).await;
let (_response, _code) = index
.update_settings(json!({
"proximityPrecision": "byAttribute",
"rankingRules": ["words", "typo", "proximity"],
}))
.await;
index.wait_task(1).await;
// the expected order is [1, 3] instead of [3, 1]
// because the attribute scale doesn't make the difference between 1 and 3.
// But 2 shouldn't be returned because "the" is not in the same attribute.
index
.search(json!({"q": "\"the soup of day\""}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
}
]
"###);
})
.await;
// the expected order is [1, 2, 3] instead of [1, 3]
// because the attribute scale sees all the word in the same attribute
// and so doesn't make the difference between the documents.
index
.search(json!({"q": "\"many the fish\""}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
}
]
"###);
})
.await;
}
#[actix_rt::test]
async fn word_scale_set_and_reset() {
let server = Server::new().await;
let index = server.index("test");
index.add_documents(DOCUMENTS.clone(), None).await;
index.wait_task(0).await;
// Set and reset the setting ensuring the swap between the 2 settings is applied.
let (_response, _code) = index
.update_settings(json!({
"proximityPrecision": "byAttribute",
"rankingRules": ["words", "typo", "proximity"],
}))
.await;
index.wait_task(1).await;
let (_response, _code) = index
.update_settings(json!({
"proximityPrecision": "byWord",
"rankingRules": ["words", "typo", "proximity"],
}))
.await;
index.wait_task(2).await;
// [3, 1, 2]
index
.search(json!({"q": "the soup of day"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
// [1, 3, 2]
index
.search(json!({"q": "many the fish"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
// [3]
index
.search(json!({"q": "\"the soup of day\""}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
}
]
"###);
})
.await;
// [1, 3]
index
.search(json!({"q": "\"many the fish\""}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
}
]
"###);
})
.await;
}
#[actix_rt::test]
async fn attribute_scale_default_ranking_rules() {
let server = Server::new().await;
let index = server.index("test");
index.add_documents(DOCUMENTS.clone(), None).await;
index.wait_task(0).await;
let (response, code) = index
.update_settings(json!({
"proximityPrecision": "byAttribute"
}))
.await;
assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(1).await;
// the expected order is [3, 1, 2]
index
.search(json!({"q": "the soup of day"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
// the expected order is [1, 3, 2] instead of [1, 3]
// because the attribute scale sees all the word in the same attribute
// and so doesn't remove the document 2.
index
.search(json!({"q": "\"many the fish\""}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"id": 1,
"a": "Soup of the day",
"b": "many the fish"
},
{
"id": 3,
"a": "the Soup of day",
"b": "many the fish"
},
{
"id": 2,
"a": "Soup of day",
"b": "many the lazy fish"
}
]
"###);
})
.await;
}

View File

@ -7,8 +7,8 @@ use clap::{Parser, Subcommand};
use dump::{DumpWriter, IndexMetadata};
use file_store::FileStore;
use meilisearch_auth::AuthController;
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
use meilisearch_types::heed::types::{OwnedType, SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, PolyDatabase, RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task};
@ -148,17 +148,15 @@ fn try_opening_poly_database(
env: &Env,
rtxn: &RoTxn,
db_name: &str,
) -> anyhow::Result<Database<Unspecified, Unspecified>> {
env.database_options()
.name(db_name)
.open(rtxn)
) -> anyhow::Result<PolyDatabase> {
env.open_poly_database(rtxn, Some(db_name))
.with_context(|| format!("While opening the {db_name:?} poly database"))?
.with_context(|| format!("Missing the {db_name:?} poly database"))
}
fn try_clearing_poly_database(
wtxn: &mut RwTxn,
database: Database<Unspecified, Unspecified>,
database: PolyDatabase,
db_name: &str,
) -> anyhow::Result<()> {
database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
@ -214,7 +212,7 @@ fn export_a_dump(
eprintln!("Successfully dumped {count} keys!");
let rtxn = env.read_txn()?;
let all_tasks: Database<BEU32, SerdeJson<Task>> =
let all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>> =
try_opening_database(&env, &rtxn, "all-tasks")?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &rtxn, "index-mapping")?;

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::convert::TryInto;
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
use meilisearch_types::heed::{BytesDecode, BytesEncode};
use uuid::Uuid;
/// A heed codec for value of struct Uuid.
@ -10,15 +10,15 @@ pub struct UuidCodec;
impl<'a> BytesDecode<'a> for UuidCodec {
type DItem = Uuid;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
bytes.try_into().map(Uuid::from_bytes).map_err(Into::into)
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
bytes.try_into().ok().map(Uuid::from_bytes)
}
}
impl BytesEncode<'_> for UuidCodec {
type EItem = Uuid;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
Ok(Cow::Borrowed(item.as_bytes()))
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Borrowed(item.as_bytes()))
}
}

View File

@ -20,22 +20,20 @@ byteorder = "1.4.3"
charabia = { version = "0.8.5", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.8"
deserr = "0.6.0"
deserr = { version = "0.6.0", features = ["actix-web"]}
either = { version = "1.8.1", features = ["serde"] }
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.5", default-features = false, features = [
"rayon",
"tempfile",
"rayon", "tempfile"
] }
heed = { version = "0.20.0-alpha.9", default-features = false, features = [
"serde-json",
"serde-bincode",
"read-txn-no-tls",
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
"lmdb", "read-txn-no-tls"
] }
indexmap = { version = "2.0.0", features = ["serde"] }
instant-distance = { version = "0.6.1", features = ["with-serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.7.1"
@ -74,23 +72,6 @@ puffin = "0.16.0"
log = "0.4.17"
logging_timer = "1.1.0"
csv = "1.2.1"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
tokio = { version = "1.34.0", features = ["rt"] }
futures = "0.3.29"
reqwest = { version = "0.11.16", features = [
"rustls-tls",
"json",
], default-features = false }
tiktoken-rs = "0.5.7"
liquid = "0.26.4"
arroy = { git = "https://github.com/meilisearch/arroy.git", version = "0.1.0" }
rand = "0.8.5"
[dev-dependencies]
mimalloc = { version = "0.1.37", default-features = false }
@ -102,15 +83,7 @@ meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
[features]
all-tokenizations = [
"charabia/chinese",
"charabia/hebrew",
"charabia/japanese",
"charabia/thai",
"charabia/korean",
"charabia/greek",
"charabia/khmer",
]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml

View File

@ -5,8 +5,8 @@ use std::time::Instant;
use heed::EnvOpenOptions;
use milli::{
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
SearchLogger, TermsMatchingStrategy,
execute_search, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext, SearchLogger,
TermsMatchingStrategy,
};
#[global_allocator]
@ -49,15 +49,14 @@ fn main() -> Result<(), Box<dyn Error>> {
let start = Instant::now();
let mut ctx = SearchContext::new(&index, &txn);
let universe = filtered_universe(&ctx, &None)?;
let docs = execute_search(
&mut ctx,
(!query.trim().is_empty()).then(|| query.trim()),
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
&None,
TermsMatchingStrategy::Last,
milli::score_details::ScoringStrategy::Skip,
false,
universe,
&None,
&None,
GeoSortStrategy::default(),
0,

41
milli/src/distance.rs Normal file
View File

@ -0,0 +1,41 @@
use std::ops;
use instant_distance::Point;
use serde::{Deserialize, Serialize};
use crate::normalize_vector;
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct NDotProductPoint(Vec<f32>);
impl NDotProductPoint {
pub fn new(point: Vec<f32>) -> Self {
NDotProductPoint(normalize_vector(point))
}
pub fn into_inner(self) -> Vec<f32> {
self.0
}
}
impl ops::Deref for NDotProductPoint {
type Target = [f32];
fn deref(&self) -> &Self::Target {
self.0.as_slice()
}
}
impl Point for NDotProductPoint {
fn distance(&self, other: &Self) -> f32 {
let dist = 1.0 - dot_product_similarity(&self.0, &other.0);
debug_assert!(!dist.is_nan());
dist
}
}
/// Returns the dot product similarity score that will between 0.0 and 1.0
/// if both vectors are normalized. The higher the more similar the vectors are.
pub fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b).map(|(a, b)| a * b).sum()
}

View File

@ -61,10 +61,6 @@ pub enum InternalError {
AbortedIndexation,
#[error("The matching words list contains at least one invalid member.")]
InvalidMatchingWords,
#[error(transparent)]
ArroyError(#[from] arroy::Error),
#[error(transparent)]
VectorEmbeddingError(#[from] crate::vector::Error),
}
#[derive(Error, Debug)]
@ -114,10 +110,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
InvalidVectorsType { document_id: Value, value: Value, subfield: String },
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: Value, value: Value },
#[error("The `_vectors` field in the document with the id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
InvalidVectorsType { document_id: Value, value: Value },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@ -158,7 +152,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
valid_fields: BTreeSet<String>,
hidden_fields: bool,
},
#[error("an environment is already opened with different options")]
#[error("{}", HeedError::BadOpenOptions)]
InvalidLmdbOpenOptions,
#[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
SortRankingRuleMissing,
@ -186,76 +180,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
UnknownInternalDocumentId { document_id: DocumentId },
#[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")]
InvalidMinTypoWordLenSetting(u8, u8),
#[error(transparent)]
VectorEmbeddingError(#[from] crate::vector::Error),
#[error(transparent)]
MissingDocumentField(#[from] crate::prompt::error::RenderPromptError),
#[error(transparent)]
InvalidPrompt(#[from] crate::prompt::error::NewPromptError),
#[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")]
InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
#[error("Too many embedders in the configuration. Found {0}, but limited to 256.")]
TooManyEmbedders(usize),
#[error("Cannot find embedder with name {0}.")]
InvalidEmbedder(String),
#[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")]
TooManyVectors(String, usize),
#[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}",
allowed_sources_for_field
.iter()
.map(|accepted| format!("`{}`", accepted))
.collect::<Vec<String>>()
.join(", "),
allowed_fields_for_source
.iter()
.map(|accepted| format!("`{}`", accepted))
.collect::<Vec<String>>()
.join(", ")
)]
InvalidFieldForSource {
embedder_name: String,
source_: crate::vector::settings::EmbedderSource,
field: &'static str,
allowed_fields_for_source: &'static [&'static str],
allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource],
},
#[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())]
InvalidOpenAiModel { embedder_name: String, model: String },
#[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")]
MissingFieldForSource {
field: &'static str,
source_: crate::vector::settings::EmbedderSource,
embedder_name: String,
},
}
impl From<crate::vector::Error> for Error {
fn from(value: crate::vector::Error) -> Self {
match value.fault() {
FaultSource::User => Error::UserError(value.into()),
FaultSource::Runtime => Error::InternalError(value.into()),
FaultSource::Bug => Error::InternalError(value.into()),
FaultSource::Undecided => Error::InternalError(value.into()),
}
}
}
impl From<arroy::Error> for Error {
fn from(value: arroy::Error) -> Self {
match value {
arroy::Error::Heed(heed) => heed.into(),
arroy::Error::Io(io) => io.into(),
arroy::Error::InvalidVecDimension { expected, received } => {
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
}
arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. }
| arroy::Error::MissingMetadata => {
Error::InternalError(InternalError::ArroyError(value))
}
}
}
}
#[derive(Error, Debug)]
@ -402,36 +326,15 @@ impl From<HeedError> for Error {
HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached),
HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile),
HeedError::Mdb(error) => InternalError(Store(error)),
// TODO use the encoding
HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })),
HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
HeedError::DatabaseClosing => InternalError(DatabaseClosing),
HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
}
}
}
#[derive(Debug, Clone, Copy)]
pub enum FaultSource {
User,
Runtime,
Bug,
Undecided,
}
impl std::fmt::Display for FaultSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
FaultSource::User => "user error",
FaultSource::Runtime => "runtime error",
FaultSource::Bug => "coding error",
FaultSource::Undecided => "error",
};
f.write_str(s)
}
}
#[test]
fn conditionally_lookup_for_error_message() {
let prefix = "Attribute `name` is not sortable.";

View File

@ -1,6 +1,6 @@
use std::collections::HashMap;
use heed::types::Str;
use heed::types::{OwnedType, Str};
use heed::{Database, RoIter, RoTxn, RwTxn};
use crate::{DocumentId, BEU32};
@ -16,10 +16,10 @@ pub struct DocumentOperation {
pub kind: DocumentOperationKind,
}
pub struct ExternalDocumentsIds(Database<Str, BEU32>);
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
impl ExternalDocumentsIds {
pub fn new(db: Database<Str, BEU32>) -> ExternalDocumentsIds {
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
ExternalDocumentsIds(db)
}
@ -29,7 +29,7 @@ impl ExternalDocumentsIds {
}
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
self.0.get(rtxn, external_id.as_ref())
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
}
/// An helper function to debug this type, returns an `HashMap` of both,
@ -38,7 +38,7 @@ impl ExternalDocumentsIds {
let mut map = HashMap::default();
for result in self.0.iter(rtxn)? {
let (external, internal) = result?;
map.insert(external.to_owned(), internal);
map.insert(external.to_owned(), internal.get());
}
Ok(map)
}
@ -55,7 +55,7 @@ impl ExternalDocumentsIds {
for DocumentOperation { external_id, internal_id, kind } in operations {
match kind {
DocumentOperationKind::Create => {
self.0.put(wtxn, &external_id, &internal_id)?;
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
}
DocumentOperationKind::Delete => {
if !self.0.delete(wtxn, &external_id)? {
@ -69,7 +69,7 @@ impl ExternalDocumentsIds {
}
/// Returns an iterator over all the external ids.
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, BEU32>> {
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
self.0.iter(rtxn)
}
}

View File

@ -2,28 +2,26 @@ use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use heed::BoxedError;
pub struct BEU16StrCodec;
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
type DItem = (u16, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n_bytes, str_bytes) = bytes.split_at(2);
let n = n_bytes.try_into().map(u16::from_be_bytes)?;
let s = str::from_utf8(str_bytes)?;
Ok((n, s))
let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
let s = str::from_utf8(str_bytes).ok()?;
Some((n, s))
}
}
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
type EItem = (u16, &'a str);
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s.len() + 2);
bytes.extend_from_slice(&n.to_be_bytes());
bytes.extend_from_slice(s.as_bytes());
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -2,28 +2,26 @@ use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use heed::BoxedError;
pub struct BEU32StrCodec;
impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
type DItem = (u32, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n_bytes, str_bytes) = bytes.split_at(4);
let n = n_bytes.try_into().map(u32::from_be_bytes)?;
let s = str::from_utf8(str_bytes)?;
Ok((n, s))
let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
let s = str::from_utf8(str_bytes).ok()?;
Some((n, s))
}
}
impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
type EItem = (u32, &'a str);
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s.len() + 4);
bytes.extend_from_slice(&n.to_be_bytes());
bytes.extend_from_slice(s.as_bytes());
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -1,23 +1,23 @@
use std::borrow::Cow;
use heed::{BoxedError, BytesDecode, BytesEncode};
use heed::{BytesDecode, BytesEncode};
/// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated
/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated
/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
pub struct BytesRefCodec;
pub struct ByteSliceRefCodec;
impl<'a> BytesEncode<'a> for BytesRefCodec {
impl<'a> BytesEncode<'a> for ByteSliceRefCodec {
type EItem = &'a [u8];
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
Ok(Cow::Borrowed(item))
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item))
}
}
impl<'a> BytesDecode<'a> for BytesRefCodec {
impl<'a> BytesDecode<'a> for ByteSliceRefCodec {
type DItem = &'a [u8];
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Ok(bytes)
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(bytes)
}
}

View File

@ -1,9 +1,8 @@
use std::borrow::Cow;
use std::marker::PhantomData;
use heed::{BoxedError, BytesDecode, BytesEncode};
use heed::{BytesDecode, BytesEncode};
use crate::heed_codec::SliceTooShortError;
use crate::{try_split_array_at, DocumentId, FieldId};
pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);
@ -14,16 +13,16 @@ where
{
type DItem = (FieldId, DocumentId, C::DItem);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
let document_id = u32::from_be_bytes(document_id_bytes);
let value = C::bytes_decode(bytes)?;
Ok((field_id, document_id, value))
Some((field_id, document_id, value))
}
}
@ -33,15 +32,13 @@ where
{
type EItem = (FieldId, DocumentId, C::EItem);
fn bytes_encode(
(field_id, document_id, value): &'a Self::EItem,
) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(32);
bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes
bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes
let value_bytes = C::bytes_encode(value)?;
// variable length, if f64 -> 16 bytes, if string -> large, potentially
bytes.extend_from_slice(&value_bytes);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -5,8 +5,8 @@ use std::borrow::Cow;
use std::convert::TryFrom;
use std::marker::PhantomData;
use heed::types::DecodeIgnore;
use heed::{BoxedError, BytesDecode, BytesEncode};
use heed::types::{DecodeIgnore, OwnedType};
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
@ -18,7 +18,7 @@ pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;
pub type FieldIdCodec = BEU16;
pub type FieldIdCodec = OwnedType<BEU16>;
/// Tries to split a slice in half at the given middle point,
/// `None` if the slice is too short.
@ -58,7 +58,7 @@ where
{
type EItem = FacetGroupKey<T::EItem>;
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let mut v = vec![];
v.extend_from_slice(&value.field_id.to_be_bytes());
v.extend_from_slice(&[value.level]);
@ -66,7 +66,7 @@ where
let bound = T::bytes_encode(&value.left_bound)?;
v.extend_from_slice(&bound);
Ok(Cow::Owned(v))
Some(Cow::Owned(v))
}
}
impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
@ -75,11 +75,11 @@ where
{
type DItem = FacetGroupKey<T::DItem>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?);
let level = bytes[2];
let bound = T::bytes_decode(&bytes[3..])?;
Ok(FacetGroupKey { field_id: fid, level, left_bound: bound })
Some(FacetGroupKey { field_id: fid, level, left_bound: bound })
}
}
@ -87,17 +87,17 @@ pub struct FacetGroupValueCodec;
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
type EItem = FacetGroupValue;
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let mut v = vec![value.size];
CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
Ok(Cow::Owned(v))
Some(Cow::Owned(v))
}
}
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
type DItem = FacetGroupValue;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let size = bytes[0];
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
Ok(FacetGroupValue { size, bitmap })
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?;
Some(FacetGroupValue { size, bitmap })
}
}

View File

@ -1,45 +1,37 @@
use std::borrow::Cow;
use std::convert::TryInto;
use heed::{BoxedError, BytesDecode};
use thiserror::Error;
use heed::BytesDecode;
use crate::facet::value_encoding::f64_into_bytes;
use crate::heed_codec::SliceTooShortError;
pub struct OrderedF64Codec;
impl<'a> BytesDecode<'a> for OrderedF64Codec {
type DItem = f64;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
if bytes.len() < 16 {
Err(SliceTooShortError.into())
} else {
bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into)
return None;
}
let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
Some(f)
}
}
impl heed::BytesEncode<'_> for OrderedF64Codec {
type EItem = f64;
fn bytes_encode(f: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 16];
// write the globally ordered float
let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?;
let bytes = f64_into_bytes(*f)?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 value just to be able to read it back
let bytes = f.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
Ok(Cow::Owned(buffer.to_vec()))
Some(Cow::Owned(buffer.to_vec()))
}
}
#[derive(Error, Debug)]
#[error("the float {float} cannot be converted to a globally ordered representation")]
pub struct InvalidGloballyOrderedFloatError {
float: f64,
}

View File

@ -1,8 +1,5 @@
use std::borrow::Cow;
use heed::BoxedError;
use super::SliceTooShortError;
use crate::{try_split_array_at, FieldId};
pub struct FieldIdWordCountCodec;
@ -10,21 +7,21 @@ pub struct FieldIdWordCountCodec;
impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
type DItem = (FieldId, u8);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
Ok((field_id, word_count))
let ([word_count], _nothing) = try_split_array_at(bytes)?;
Some((field_id, word_count))
}
}
impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
type EItem = (FieldId, u8);
fn bytes_encode((field_id, word_count): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(2 + 1);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*word_count);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use fst::Set;
use heed::{BoxedError, BytesDecode, BytesEncode};
use heed::{BytesDecode, BytesEncode};
/// A codec for values of type `Set<&[u8]>`.
pub struct FstSetCodec;
@ -9,15 +9,15 @@ pub struct FstSetCodec;
impl<'a> BytesEncode<'a> for FstSetCodec {
type EItem = Set<Vec<u8>>;
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
Ok(Cow::Borrowed(item.as_fst().as_bytes()))
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item.as_fst().as_bytes()))
}
}
impl<'a> BytesDecode<'a> for FstSetCodec {
type DItem = Set<&'a [u8]>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Set::new(bytes).map_err(Into::into)
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Set::new(bytes).ok()
}
}

View File

@ -12,10 +12,8 @@ mod str_beu32_codec;
mod str_ref;
mod str_str_u8_codec;
pub use byte_slice_ref::BytesRefCodec;
use heed::BoxedError;
pub use byte_slice_ref::ByteSliceRefCodec;
pub use str_ref::StrRefCodec;
use thiserror::Error;
pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
@ -33,9 +31,5 @@ pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
pub trait BytesDecodeOwned {
type DItem;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError>;
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem>;
}
#[derive(Error, Debug)]
#[error("the slice is too short")]
pub struct SliceTooShortError;

View File

@ -1,6 +1,5 @@
use std::borrow::Cow;
use heed::BoxedError;
use obkv::{KvReaderU16, KvWriterU16};
pub struct ObkvCodec;
@ -8,15 +7,15 @@ pub struct ObkvCodec;
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
type DItem = KvReaderU16<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Ok(KvReaderU16::new(bytes))
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(KvReaderU16::new(bytes))
}
}
impl heed::BytesEncode<'_> for ObkvCodec {
type EItem = KvWriterU16<Vec<u8>>;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
item.clone().into_inner().map(Cow::Owned).map_err(Into::into)
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
item.clone().into_inner().map(Cow::Owned).ok()
}
}

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::convert::TryInto;
use std::mem::size_of;
use heed::{BoxedError, BytesDecode};
use heed::BytesDecode;
use roaring::RoaringBitmap;
use crate::heed_codec::BytesDecodeOwned;
@ -19,22 +19,22 @@ impl BoRoaringBitmapCodec {
impl BytesDecode<'_> for BoRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
let mut bitmap = RoaringBitmap::new();
for chunk in bytes.chunks(size_of::<u32>()) {
let bytes = chunk.try_into()?;
let bytes = chunk.try_into().ok()?;
bitmap.push(u32::from_ne_bytes(bytes));
}
Ok(bitmap)
Some(bitmap)
}
}
impl BytesDecodeOwned for BoRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
Self::bytes_decode(bytes)
}
}
@ -42,9 +42,9 @@ impl BytesDecodeOwned for BoRoaringBitmapCodec {
impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
type EItem = RoaringBitmap;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
let mut out = Vec::new();
BoRoaringBitmapCodec::serialize_into(item, &mut out);
Ok(Cow::Owned(out))
Some(Cow::Owned(out))
}
}

View File

@ -3,7 +3,6 @@ use std::io;
use std::mem::size_of;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use heed::BoxedError;
use roaring::RoaringBitmap;
use crate::heed_codec::BytesDecodeOwned;
@ -133,26 +132,26 @@ impl CboRoaringBitmapCodec {
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
Self::deserialize_from(bytes).map_err(Into::into)
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
Self::deserialize_from(bytes).ok()
}
}
impl BytesDecodeOwned for CboRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
Self::deserialize_from(bytes).map_err(Into::into)
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
Self::deserialize_from(bytes).ok()
}
}
impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
type EItem = RoaringBitmap;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
let mut vec = Vec::with_capacity(Self::serialized_size(item));
Self::serialize_into(item, &mut vec);
Ok(Cow::Owned(vec))
Some(Cow::Owned(vec))
}
}

View File

@ -1,6 +1,5 @@
use std::borrow::Cow;
use heed::BoxedError;
use roaring::RoaringBitmap;
use crate::heed_codec::BytesDecodeOwned;
@ -10,25 +9,25 @@ pub struct RoaringBitmapCodec;
impl heed::BytesDecode<'_> for RoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into)
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmap::deserialize_unchecked_from(bytes).ok()
}
}
impl BytesDecodeOwned for RoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmap::deserialize_from(bytes).ok()
}
}
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
type EItem = RoaringBitmap;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(item.serialized_size());
item.serialize_into(&mut bytes)?;
Ok(Cow::Owned(bytes))
item.serialize_into(&mut bytes).ok()?;
Some(Cow::Owned(bytes))
}
}

View File

@ -1,6 +1,6 @@
use std::mem;
use heed::{BoxedError, BytesDecode};
use heed::BytesDecode;
use crate::heed_codec::BytesDecodeOwned;
@ -9,15 +9,15 @@ pub struct BoRoaringBitmapLenCodec;
impl BytesDecode<'_> for BoRoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
Ok((bytes.len() / mem::size_of::<u32>()) as u64)
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
Some((bytes.len() / mem::size_of::<u32>()) as u64)
}
}
impl BytesDecodeOwned for BoRoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
Self::bytes_decode(bytes)
}
}

View File

@ -1,6 +1,6 @@
use std::mem;
use heed::{BoxedError, BytesDecode};
use heed::BytesDecode;
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
@ -11,7 +11,7 @@ pub struct CboRoaringBitmapLenCodec;
impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
// If there is threshold or less than threshold integers that can fit into this array
// of bytes it means that we used the ByteOrder codec serializer.
@ -27,7 +27,7 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
impl BytesDecodeOwned for CboRoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
Self::bytes_decode(bytes)
}
}

View File

@ -2,7 +2,6 @@ use std::io::{self, BufRead, Read};
use std::mem;
use byteorder::{LittleEndian, ReadBytesExt};
use heed::BoxedError;
use crate::heed_codec::BytesDecodeOwned;
@ -57,16 +56,16 @@ impl RoaringBitmapLenCodec {
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
}
}
impl BytesDecodeOwned for RoaringBitmapLenCodec {
type DItem = u64;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
}
}

View File

@ -1,31 +1,30 @@
use std::borrow::Cow;
use std::ffi::CStr;
use std::str;
use charabia::{Language, Script};
use heed::BoxedError;
pub struct ScriptLanguageCodec;
impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
type DItem = (Script, Language);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let cstr = CStr::from_bytes_until_nul(bytes)?;
let script = cstr.to_str()?;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let sep = bytes.iter().position(|b| *b == 0)?;
let (s_bytes, l_bytes) = bytes.split_at(sep);
let script = str::from_utf8(s_bytes).ok()?;
let script_name = Script::from_name(script);
let lan = str::from_utf8(l_bytes).ok()?;
// skip '\0' byte between the two strings.
let lan = str::from_utf8(&bytes[script.len() + 1..])?;
let lan_name = Language::from_name(lan);
let lan_name = Language::from_name(&lan[1..]);
Ok((script_name, lan_name))
Some((script_name, lan_name))
}
}
impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
type EItem = (Script, Language);
fn bytes_encode((script, lan): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
let script_name = script.name().as_bytes();
let lan_name = lan.name().as_bytes();
@ -34,6 +33,6 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
bytes.push(0);
bytes.extend_from_slice(lan_name);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -3,41 +3,37 @@ use std::convert::TryInto;
use std::mem::size_of;
use std::str;
use heed::BoxedError;
use super::SliceTooShortError;
pub struct StrBEU32Codec;
impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
type DItem = (&'a str, u32);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u32>();
if bytes.len() < footer_len {
return Err(SliceTooShortError.into());
return None;
}
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word)?;
let pos = bytes.try_into().map(u32::from_be_bytes)?;
let word = str::from_utf8(word).ok()?;
let pos = bytes.try_into().map(u32::from_be_bytes).ok()?;
Ok((word, pos))
Some((word, pos))
}
}
impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
type EItem = (&'a str, u32);
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
let pos = pos.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + pos.len());
bytes.extend_from_slice(word.as_bytes());
bytes.extend_from_slice(&pos[..]);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}
@ -46,27 +42,26 @@ pub struct StrBEU16Codec;
impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
type DItem = (&'a str, u16);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u16>();
if bytes.len() < footer_len + 1 {
return Err(SliceTooShortError.into());
return None;
}
let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
// unwrap: we just checked the footer + 1 above.
let (_, word) = word_plus_nul_byte.split_last().unwrap();
let word = str::from_utf8(word)?;
let pos = bytes.try_into().map(u16::from_be_bytes)?;
let (_, word) = word_plus_nul_byte.split_last()?;
let word = str::from_utf8(word).ok()?;
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
Ok((word, pos))
Some((word, pos))
}
}
impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
type EItem = (&'a str, u16);
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
let pos = pos.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
@ -74,6 +69,6 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
bytes.push(0);
bytes.extend_from_slice(&pos[..]);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

View File

@ -1,6 +1,6 @@
use std::borrow::Cow;
use heed::{BoxedError, BytesDecode, BytesEncode};
use heed::{BytesDecode, BytesEncode};
/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated
/// types are equivalent (= `&'a str`) and these values can reside within another structure.
@ -8,14 +8,15 @@ pub struct StrRefCodec;
impl<'a> BytesEncode<'a> for StrRefCodec {
type EItem = &'a str;
fn bytes_encode(item: &'a &'a str) -> Result<Cow<'a, [u8]>, BoxedError> {
Ok(Cow::Borrowed(item.as_bytes()))
fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item.as_bytes()))
}
}
impl<'a> BytesDecode<'a> for StrRefCodec {
type DItem = &'a str;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
std::str::from_utf8(bytes).map_err(Into::into)
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let s = std::str::from_utf8(bytes).ok()?;
Some(s)
}
}

View File

@ -1,36 +1,32 @@
use std::borrow::Cow;
use std::ffi::CStr;
use std::str;
use heed::BoxedError;
use super::SliceTooShortError;
pub struct U8StrStrCodec;
impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
type DItem = (u8, &'a str, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
let cstr = CStr::from_bytes_until_nul(bytes)?;
let s1 = cstr.to_str()?;
// skip '\0' byte between the two strings.
let s2 = str::from_utf8(&bytes[s1.len() + 1..])?;
Ok((*n, s1, s2))
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n, bytes) = bytes.split_first()?;
let s1_end = bytes.iter().position(|b| *b == 0)?;
let (s1_bytes, rest) = bytes.split_at(s1_end);
let s2_bytes = &rest[1..];
let s1 = str::from_utf8(s1_bytes).ok()?;
let s2 = str::from_utf8(s2_bytes).ok()?;
Some((*n, s1, s2))
}
}
impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
type EItem = (u8, &'a str, &'a str);
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
bytes.push(*n);
bytes.extend_from_slice(s1.as_bytes());
bytes.push(0);
bytes.extend_from_slice(s2.as_bytes());
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}
pub struct UncheckedU8StrStrCodec;
@ -38,25 +34,24 @@ pub struct UncheckedU8StrStrCodec;
impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
type DItem = (u8, &'a [u8], &'a [u8]);
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
let cstr = CStr::from_bytes_until_nul(bytes)?;
let s1_bytes = cstr.to_bytes();
// skip '\0' byte between the two strings.
let s2_bytes = &bytes[s1_bytes.len() + 1..];
Ok((*n, s1_bytes, s2_bytes))
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n, bytes) = bytes.split_first()?;
let s1_end = bytes.iter().position(|b| *b == 0)?;
let (s1_bytes, rest) = bytes.split_at(s1_end);
let s2_bytes = &rest[1..];
Some((*n, s1_bytes, s2_bytes))
}
}
impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
type EItem = (u8, &'a [u8], &'a [u8]);
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
bytes.push(*n);
bytes.extend_from_slice(s1);
bytes.push(0);
bytes.extend_from_slice(s2);
Ok(Cow::Owned(bytes))
Some(Cow::Owned(bytes))
}
}

File diff suppressed because it is too large Load Diff

View File

@ -10,18 +10,18 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod distance;
mod error;
mod external_documents_ids;
pub mod facet;
mod fields_ids_map;
pub mod heed_codec;
pub mod index;
pub mod prompt;
pub mod proximity;
mod readable_slices;
pub mod score_details;
mod search;
pub mod update;
pub mod vector;
#[cfg(test)]
#[macro_use]
@ -32,12 +32,13 @@ use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use distance::dot_product_similarity;
pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
pub use search::new::{
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
SearchLogger, VisualSearchLogger,
execute_search, DefaultSearchLogger, GeoSortStrategy, SearchContext, SearchLogger,
VisualSearchLogger,
};
use serde_json::Value;
pub use {charabia as tokenizer, heed};
@ -65,9 +66,9 @@ pub use self::search::{
pub type Result<T> = std::result::Result<T, error::Error>;
pub type Attribute = u32;
pub type BEU16 = heed::types::U16<heed::byteorder::BE>;
pub type BEU32 = heed::types::U32<heed::byteorder::BE>;
pub type BEU64 = heed::types::U64<heed::byteorder::BE>;
pub type BEU16 = heed::zerocopy::U16<heed::byteorder::BE>;
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
pub type DocumentId = u32;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;

View File

@ -1,97 +0,0 @@
use liquid::model::{
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};
use super::document::Document;
use super::fields::Fields;
use crate::FieldsIdsMap;
#[derive(Debug, Clone)]
pub struct Context<'a> {
document: &'a Document<'a>,
fields: Fields<'a>,
}
impl<'a> Context<'a> {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
Self { document, fields: Fields::new(document, field_id_map) }
}
}
impl<'a> ObjectView for Context<'a> {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(
std::iter::once(self.document.as_value())
.chain(std::iter::once(self.fields.as_value())),
)
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "doc" || index == "fields"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
match index {
"doc" => Some(self.document.as_value()),
"fields" => Some(self.fields.as_value()),
_ => None,
}
}
}
impl<'a> ValueView for Context<'a> {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: liquid::model::State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue | State::Empty | State::Blank => false,
}
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
let s = ObjectRender::new(self).to_string();
KStringCow::from_string(s)
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Object(
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}

View File

@ -1,131 +0,0 @@
use std::cell::OnceCell;
use std::collections::BTreeMap;
use liquid::model::{
DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::FieldsIdsMap;
#[derive(Debug, Clone)]
pub struct Document<'a>(BTreeMap<&'a str, (&'a [u8], ParsedValue)>);
#[derive(Debug, Clone)]
struct ParsedValue(std::cell::OnceCell<LiquidValue>);
impl ParsedValue {
fn empty() -> ParsedValue {
ParsedValue(OnceCell::new())
}
fn get(&self, raw: &[u8]) -> &LiquidValue {
self.0.get_or_init(|| {
let value: serde_json::Value = serde_json::from_slice(raw).unwrap();
liquid::model::to_value(&value).unwrap()
})
}
}
impl<'a> Document<'a> {
pub fn new(
data: obkv::KvReaderU16<'a>,
side: DelAdd,
inverted_field_map: &'a FieldsIdsMap,
) -> Self {
let mut out_data = BTreeMap::new();
for (fid, raw) in data {
let obkv = KvReaderDelAdd::new(raw);
let Some(raw) = obkv.get(side) else {
continue;
};
let Some(name) = inverted_field_map.name(fid) else {
continue;
};
out_data.insert(name, (raw, ParsedValue::empty()));
}
Self(out_data)
}
fn is_empty(&self) -> bool {
self.0.is_empty()
}
fn len(&self) -> usize {
self.0.len()
}
fn iter(&self) -> impl Iterator<Item = (KString, LiquidValue)> + '_ {
self.0.iter().map(|(&k, (raw, data))| (k.to_owned().into(), data.get(raw).to_owned()))
}
}
impl<'a> ObjectView for Document<'a> {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
self.len() as i64
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
let keys = BTreeMap::keys(&self.0).map(|&s| s.into());
Box::new(keys)
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(self.0.values().map(|(raw, v)| v.get(raw) as &dyn ValueView))
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.0.iter().map(|(&k, (raw, data))| (k.into(), data.get(raw) as &dyn ValueView)))
}
fn contains_key(&self, index: &str) -> bool {
self.0.contains_key(index)
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
self.0.get(index).map(|(raw, v)| v.get(raw) as &dyn ValueView)
}
}
impl<'a> ValueView for Document<'a> {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: liquid::model::State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
}
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
let s = ObjectRender::new(self).to_string();
KStringCow::from_string(s)
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Object(self.iter().collect())
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}

View File

@ -1,56 +0,0 @@
use crate::error::FaultSource;
#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct NewPromptError {
pub kind: NewPromptErrorKind,
pub fault: FaultSource,
}
impl From<NewPromptError> for crate::Error {
fn from(value: NewPromptError) -> Self {
crate::Error::UserError(crate::UserError::InvalidPrompt(value))
}
}
impl NewPromptError {
pub(crate) fn cannot_parse_template(inner: liquid::Error) -> NewPromptError {
Self { kind: NewPromptErrorKind::CannotParseTemplate(inner), fault: FaultSource::User }
}
pub(crate) fn invalid_fields_in_template(inner: liquid::Error) -> NewPromptError {
Self { kind: NewPromptErrorKind::InvalidFieldsInTemplate(inner), fault: FaultSource::User }
}
}
#[derive(Debug, thiserror::Error)]
pub enum NewPromptErrorKind {
#[error("cannot parse template: {0}")]
CannotParseTemplate(liquid::Error),
#[error("template contains invalid fields: {0}. Only `doc.*`, `fields[i].name`, `fields[i].value` are supported")]
InvalidFieldsInTemplate(liquid::Error),
}
#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct RenderPromptError {
pub kind: RenderPromptErrorKind,
pub fault: FaultSource,
}
impl RenderPromptError {
pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError {
Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User }
}
}
#[derive(Debug, thiserror::Error)]
pub enum RenderPromptErrorKind {
#[error("missing field in document: {0}")]
MissingContext(liquid::Error),
}
impl From<RenderPromptError> for crate::Error {
fn from(value: RenderPromptError) -> Self {
crate::Error::UserError(crate::UserError::MissingDocumentField(value))
}
}

View File

@ -1,172 +0,0 @@
use liquid::model::{
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};
use super::document::Document;
use crate::FieldsIdsMap;
#[derive(Debug, Clone)]
pub struct Fields<'a>(Vec<FieldValue<'a>>);
impl<'a> Fields<'a> {
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
Self(
std::iter::repeat(document)
.zip(field_id_map.iter())
.map(|(document, (_fid, name))| FieldValue { document, name })
.collect(),
)
}
}
#[derive(Debug, Clone, Copy)]
pub struct FieldValue<'a> {
name: &'a str,
document: &'a Document<'a>,
}
impl<'a> ValueView for FieldValue<'a> {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: liquid::model::State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
}
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
let s = ObjectRender::new(self).to_string();
KStringCow::from_string(s)
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Object(
self.iter().map(|(k, v)| (k.to_string().into(), v.to_value())).collect(),
)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}
impl<'a> FieldValue<'a> {
pub fn name(&self) -> &&'a str {
&self.name
}
pub fn value(&self) -> &dyn ValueView {
self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
}
pub fn is_empty(&self) -> bool {
self.size() == 0
}
}
impl<'a> ObjectView for FieldValue<'a> {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["name", "value"].iter().map(|&x| KStringCow::from_static(x)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(
std::iter::once(self.name() as &dyn ValueView).chain(std::iter::once(self.value())),
)
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "name" || index == "value"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
match index {
"name" => Some(self.name()),
"value" => Some(self.value()),
_ => None,
}
}
}
impl<'a> ArrayView for Fields<'a> {
fn as_value(&self) -> &dyn ValueView {
self.0.as_value()
}
fn size(&self) -> i64 {
self.0.len() as i64
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
self.0.values()
}
fn contains_key(&self, index: i64) -> bool {
self.0.contains_key(index)
}
fn get(&self, index: i64) -> Option<&dyn ValueView> {
ArrayView::get(&self.0, index)
}
}
impl<'a> ValueView for Fields<'a> {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
self.0.render()
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
self.0.source()
}
fn type_name(&self) -> &'static str {
self.0.type_name()
}
fn query_state(&self, state: liquid::model::State) -> bool {
self.0.query_state(state)
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
self.0.to_kstr()
}
fn to_value(&self) -> LiquidValue {
self.0.to_value()
}
fn as_array(&self) -> Option<&dyn ArrayView> {
Some(self)
}
}

View File

@ -1,176 +0,0 @@
mod context;
mod document;
pub(crate) mod error;
mod fields;
mod template_checker;
use std::convert::TryFrom;
use error::{NewPromptError, RenderPromptError};
use self::context::Context;
use self::document::Document;
use crate::update::del_add::DelAdd;
use crate::FieldsIdsMap;
pub struct Prompt {
template: liquid::Template,
template_text: String,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PromptData {
pub template: String,
}
impl From<Prompt> for PromptData {
fn from(value: Prompt) -> Self {
Self { template: value.template_text }
}
}
impl TryFrom<PromptData> for Prompt {
type Error = NewPromptError;
fn try_from(value: PromptData) -> Result<Self, Self::Error> {
Prompt::new(value.template)
}
}
impl Clone for Prompt {
fn clone(&self) -> Self {
let template_text = self.template_text.clone();
Self { template: new_template(&template_text).unwrap(), template_text }
}
}
fn new_template(text: &str) -> Result<liquid::Template, liquid::Error> {
liquid::ParserBuilder::with_stdlib().build().unwrap().parse(text)
}
fn default_template() -> liquid::Template {
new_template(default_template_text()).unwrap()
}
fn default_template_text() -> &'static str {
"{% for field in fields %} \
{{ field.name }}: {{ field.value }}\n\
{% endfor %}"
}
impl Default for Prompt {
fn default() -> Self {
Self { template: default_template(), template_text: default_template_text().into() }
}
}
impl Default for PromptData {
fn default() -> Self {
Self { template: default_template_text().into() }
}
}
impl Prompt {
pub fn new(template: String) -> Result<Self, NewPromptError> {
let this = Self {
template: liquid::ParserBuilder::with_stdlib()
.build()
.unwrap()
.parse(&template)
.map_err(NewPromptError::cannot_parse_template)?,
template_text: template,
};
// render template with special object that's OK with `doc.*` and `fields.*`
this.template
.render(&template_checker::TemplateChecker)
.map_err(NewPromptError::invalid_fields_in_template)?;
Ok(this)
}
pub fn render(
&self,
document: obkv::KvReaderU16<'_>,
side: DelAdd,
field_id_map: &FieldsIdsMap,
) -> Result<String, RenderPromptError> {
let document = Document::new(document, side, field_id_map);
let context = Context::new(&document, field_id_map);
self.template.render(&context).map_err(RenderPromptError::missing_context)
}
}
#[cfg(test)]
mod test {
use super::Prompt;
use crate::error::FaultSource;
use crate::prompt::error::{NewPromptError, NewPromptErrorKind};
#[test]
fn default_template() {
// does not panic
Prompt::default();
}
#[test]
fn empty_template() {
Prompt::new("".into()).unwrap();
}
#[test]
fn template_ok() {
Prompt::new("{{doc.title}}: {{doc.overview}}".into()).unwrap();
}
#[test]
fn template_syntax() {
assert!(matches!(
Prompt::new("{{doc.title: {{doc.overview}}".into()),
Err(NewPromptError {
kind: NewPromptErrorKind::CannotParseTemplate(_),
fault: FaultSource::User
})
));
}
#[test]
fn template_missing_doc() {
assert!(matches!(
Prompt::new("{{title}}: {{overview}}".into()),
Err(NewPromptError {
kind: NewPromptErrorKind::InvalidFieldsInTemplate(_),
fault: FaultSource::User
})
));
}
#[test]
fn template_nested_doc() {
Prompt::new("{{doc.actor.firstName}}: {{doc.actor.lastName}}".into()).unwrap();
}
#[test]
fn template_fields() {
Prompt::new("{% for field in fields %}{{field}}{% endfor %}".into()).unwrap();
}
#[test]
fn template_fields_ok() {
Prompt::new("{% for field in fields %}{{field.name}}: {{field.value}}{% endfor %}".into())
.unwrap();
}
#[test]
fn template_fields_invalid() {
assert!(matches!(
// intentionally garbled field
Prompt::new("{% for field in fields %}{{field.vaelu}} {% endfor %}".into()),
Err(NewPromptError {
kind: NewPromptErrorKind::InvalidFieldsInTemplate(_),
fault: FaultSource::User
})
));
}
}

View File

@ -1,301 +0,0 @@
use liquid::model::{
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{Object, ObjectView, ValueView};
#[derive(Debug)]
pub struct TemplateChecker;
#[derive(Debug)]
pub struct DummyDoc;
#[derive(Debug)]
pub struct DummyFields;
#[derive(Debug)]
pub struct DummyField;
const DUMMY_VALUE: &LiquidValue = &LiquidValue::Nil;
impl ObjectView for DummyField {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["name", "value"].iter().map(|s| KStringCow::from_static(s)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(vec![DUMMY_VALUE.as_view(), DUMMY_VALUE.as_view()].into_iter())
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "name" || index == "value"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
if self.contains_key(index) {
Some(DUMMY_VALUE.as_view())
} else {
None
}
}
}
impl ValueView for DummyField {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
let mut this = Object::new();
this.insert("name".into(), LiquidValue::Nil);
this.insert("value".into(), LiquidValue::Nil);
LiquidValue::Object(this)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}
impl ValueView for DummyFields {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"array"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Array(vec![DummyField.to_value()])
}
fn as_array(&self) -> Option<&dyn ArrayView> {
Some(self)
}
}
impl ArrayView for DummyFields {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
u16::MAX as i64
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(std::iter::once(DummyField.as_value()))
}
fn contains_key(&self, index: i64) -> bool {
index < self.size()
}
fn get(&self, _index: i64) -> Option<&dyn ValueView> {
Some(DummyField.as_value())
}
}
impl ObjectView for DummyDoc {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
1000
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(std::iter::empty())
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(std::iter::empty())
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(std::iter::empty())
}
fn contains_key(&self, _index: &str) -> bool {
true
}
fn get<'s>(&'s self, _index: &str) -> Option<&'s dyn ValueView> {
// Recursively sends itself
Some(self)
}
}
impl ValueView for DummyDoc {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Nil
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}
impl ObjectView for TemplateChecker {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(
std::iter::once(DummyDoc.as_value()).chain(std::iter::once(DummyFields.as_value())),
)
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "doc" || index == "fields"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
match index {
"doc" => Some(DummyDoc.as_value()),
"fields" => Some(DummyFields.as_value()),
_ => None,
}
}
}
impl ValueView for TemplateChecker {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: liquid::model::State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue | State::Empty | State::Blank => false,
}
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
let s = ObjectRender::new(self).to_string();
KStringCow::from_string(s)
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Object(
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}

View File

@ -1,7 +1,5 @@
use std::cmp;
use serde::{Deserialize, Serialize};
use crate::{relative_from_absolute_position, Position};
pub const MAX_DISTANCE: u32 = 4;
@ -27,11 +25,3 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
pub fn path_proximity(path: &[Position]) -> u32 {
path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub enum ProximityPrecision {
#[default]
ByWord,
ByAttribute,
}

View File

@ -0,0 +1,85 @@
use std::io::{self, Read};
use std::iter::FromIterator;
pub struct ReadableSlices<A> {
inner: Vec<A>,
pos: u64,
}
impl<A> FromIterator<A> for ReadableSlices<A> {
fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
ReadableSlices { inner: iter.into_iter().collect(), pos: 0 }
}
}
impl<A: AsRef<[u8]>> Read for ReadableSlices<A> {
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
let original_buf_len = buf.len();
// We explore the list of slices to find the one where we must start reading.
let mut pos = self.pos;
let index = match self
.inner
.iter()
.map(|s| s.as_ref().len() as u64)
.position(|size| pos.checked_sub(size).map(|p| pos = p).is_none())
{
Some(index) => index,
None => return Ok(0),
};
let mut inner_pos = pos as usize;
for slice in &self.inner[index..] {
let slice = &slice.as_ref()[inner_pos..];
if buf.len() > slice.len() {
// We must exhaust the current slice and go to the next one there is not enough here.
buf[..slice.len()].copy_from_slice(slice);
buf = &mut buf[slice.len()..];
inner_pos = 0;
} else {
// There is enough in this slice to fill the remaining bytes of the buffer.
// Let's break just after filling it.
buf.copy_from_slice(&slice[..buf.len()]);
buf = &mut [];
break;
}
}
let written = original_buf_len - buf.len();
self.pos += written as u64;
Ok(written)
}
}
#[cfg(test)]
mod test {
use std::io::Read;
use super::ReadableSlices;
#[test]
fn basic() {
let data: Vec<_> = (0..100).collect();
let splits: Vec<_> = data.chunks(3).collect();
let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
let mut output = Vec::new();
let length = rdslices.read_to_end(&mut output).unwrap();
assert_eq!(length, data.len());
assert_eq!(output, data);
}
#[test]
fn small_reads() {
let data: Vec<_> = (0..u8::MAX).collect();
let splits: Vec<_> = data.chunks(27).collect();
let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
let buffer = &mut [0; 45];
let length = rdslices.read(buffer).unwrap();
let expected: Vec<_> = (0..buffer.len() as u8).collect();
assert_eq!(length, buffer.len());
assert_eq!(buffer, &expected[..]);
}
}

View File

@ -1,6 +1,3 @@
use std::cmp::Ordering;
use itertools::Itertools;
use serde::Serialize;
use crate::distance_between_two_points;
@ -15,24 +12,9 @@ pub enum ScoreDetails {
ExactAttribute(ExactAttribute),
ExactWords(ExactWords),
Sort(Sort),
Vector(Vector),
GeoSort(GeoSort),
}
#[derive(Clone, Copy)]
pub enum ScoreValue<'a> {
Score(f64),
Sort(&'a Sort),
GeoSort(&'a GeoSort),
}
enum RankOrValue<'a> {
Rank(Rank),
Sort(&'a Sort),
GeoSort(&'a GeoSort),
Score(f64),
}
impl ScoreDetails {
pub fn local_score(&self) -> Option<f64> {
self.rank().map(Rank::local_score)
@ -49,55 +31,11 @@ impl ScoreDetails {
ScoreDetails::ExactWords(details) => Some(details.rank()),
ScoreDetails::Sort(_) => None,
ScoreDetails::GeoSort(_) => None,
ScoreDetails::Vector(_) => None,
}
}
pub fn global_score<'a>(details: impl Iterator<Item = &'a Self> + 'a) -> f64 {
Self::score_values(details)
.find_map(|x| {
let ScoreValue::Score(score) = x else {
return None;
};
Some(score)
})
.unwrap_or(1.0f64)
}
pub fn score_values<'a>(
details: impl Iterator<Item = &'a Self> + 'a,
) -> impl Iterator<Item = ScoreValue<'a>> + 'a {
details
.map(ScoreDetails::rank_or_value)
.coalesce(|left, right| match (left, right) {
(RankOrValue::Rank(left), RankOrValue::Rank(right)) => {
Ok(RankOrValue::Rank(Rank::merge(left, right)))
}
(left, right) => Err((left, right)),
})
.map(|rank_or_value| match rank_or_value {
RankOrValue::Rank(r) => ScoreValue::Score(r.local_score()),
RankOrValue::Sort(s) => ScoreValue::Sort(s),
RankOrValue::GeoSort(g) => ScoreValue::GeoSort(g),
RankOrValue::Score(s) => ScoreValue::Score(s),
})
}
fn rank_or_value(&self) -> RankOrValue<'_> {
match self {
ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()),
ScoreDetails::Typo(t) => RankOrValue::Rank(t.rank()),
ScoreDetails::Proximity(p) => RankOrValue::Rank(*p),
ScoreDetails::Fid(f) => RankOrValue::Rank(*f),
ScoreDetails::Position(p) => RankOrValue::Rank(*p),
ScoreDetails::ExactAttribute(e) => RankOrValue::Rank(e.rank()),
ScoreDetails::ExactWords(e) => RankOrValue::Rank(e.rank()),
ScoreDetails::Sort(sort) => RankOrValue::Sort(sort),
ScoreDetails::GeoSort(geosort) => RankOrValue::GeoSort(geosort),
ScoreDetails::Vector(vector) => RankOrValue::Score(
vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64),
),
}
pub fn global_score<'a>(details: impl Iterator<Item = &'a Self>) -> f64 {
Rank::global_score(details.filter_map(Self::rank))
}
/// Panics
@ -243,19 +181,6 @@ impl ScoreDetails {
details_map.insert(sort, sort_details);
order += 1;
}
ScoreDetails::Vector(s) => {
let vector = format!("vectorSort({:?})", s.target_vector);
let value = s.value_similarity.as_ref().map(|(v, _)| v);
let similarity = s.value_similarity.as_ref().map(|(_, s)| s);
let details = serde_json::json!({
"order": order,
"value": value,
"similarity": similarity,
});
details_map.insert(vector, details);
order += 1;
}
}
}
details_map
@ -372,21 +297,15 @@ impl Rank {
pub fn global_score(details: impl Iterator<Item = Self>) -> f64 {
let mut rank = Rank { rank: 1, max_rank: 1 };
for inner_rank in details {
rank = Rank::merge(rank, inner_rank);
rank.rank -= 1;
rank.rank *= inner_rank.max_rank;
rank.max_rank *= inner_rank.max_rank;
rank.rank += inner_rank.rank;
}
rank.local_score()
}
pub fn merge(mut outer: Rank, inner: Rank) -> Rank {
outer.rank = outer.rank.saturating_sub(1);
outer.rank *= inner.max_rank;
outer.max_rank *= inner.max_rank;
outer.rank += inner.rank;
outer
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
@ -416,78 +335,13 @@ pub struct Sort {
pub value: serde_json::Value,
}
impl PartialOrd for Sort {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
if self.field_name != other.field_name {
return None;
}
if self.ascending != other.ascending {
return None;
}
match (&self.value, &other.value) {
(serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal),
(serde_json::Value::Null, _) => Some(Ordering::Less),
(_, serde_json::Value::Null) => Some(Ordering::Greater),
// numbers are always before strings
(serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater),
(serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less),
(serde_json::Value::Number(left), serde_json::Value::Number(right)) => {
// FIXME: unwrap permitted here?
let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?;
// 12 < 42, and when ascending, we want to see 12 first, so the smallest.
// Hence, when ascending, smaller is better
Some(if self.ascending { order.reverse() } else { order })
}
(serde_json::Value::String(left), serde_json::Value::String(right)) => {
let order = left.cmp(right);
// Taking e.g. "a" and "z"
// "a" < "z", and when ascending, we want to see "a" first, so the smallest.
// Hence, when ascending, smaller is better
Some(if self.ascending { order.reverse() } else { order })
}
_ => None,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq)]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub struct GeoSort {
pub target_point: [f64; 2],
pub ascending: bool,
pub value: Option<[f64; 2]>,
}
impl PartialOrd for GeoSort {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
if self.target_point != other.target_point {
return None;
}
if self.ascending != other.ascending {
return None;
}
Some(match (self.distance(), other.distance()) {
(None, None) => Ordering::Equal,
(None, Some(_)) => Ordering::Less,
(Some(_), None) => Ordering::Greater,
(Some(left), Some(right)) => {
let order = left.partial_cmp(&right)?;
if self.ascending {
// when ascending, the one with the smallest distance has the best score
order.reverse()
} else {
order
}
}
})
}
}
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct Vector {
pub target_vector: Vec<f32>,
pub value_similarity: Option<(Vec<f32>, f32)>,
}
impl GeoSort {
pub fn distance(&self) -> Option<f64> {
self.value.map(|value| distance_between_two_points(&self.target_point, &value))

View File

@ -2,7 +2,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
use std::ops::ControlFlow;
use std::{fmt, mem};
use heed::types::Bytes;
use heed::types::ByteSlice;
use heed::BytesDecode;
use indexmap::IndexMap;
use roaring::RoaringBitmap;
@ -13,7 +13,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
};
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter::{
count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
@ -105,7 +105,7 @@ impl<'a> FacetDistribution<'a> {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
.remap_key_type::<Bytes>()
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<FieldDocIdFacetF64Codec>();
@ -129,7 +129,7 @@ impl<'a> FacetDistribution<'a> {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
.remap_key_type::<Bytes>()
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
@ -172,7 +172,9 @@ impl<'a> FacetDistribution<'a> {
search_function(
self.rtxn,
self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
self.index
.facet_id_f64_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
candidates,
|facet_key, nbr_docids, _| {
@ -201,7 +203,9 @@ impl<'a> FacetDistribution<'a> {
search_function(
self.rtxn,
self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
self.index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
candidates,
|facet_key, nbr_docids, any_docid| {

View File

@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
use crate::DocumentId;
/// Call the given closure on the facet distribution of the candidate documents.
@ -23,7 +23,7 @@ use crate::DocumentId;
/// keep iterating over the different facet values or stop.
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
callback: CB,
@ -34,11 +34,11 @@ where
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
Ok(())
} else {
@ -48,7 +48,7 @@ where
pub fn count_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
mut callback: CB,
@ -77,11 +77,11 @@ where
let mut heap = BinaryHeap::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
@ -146,7 +146,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
callback: CB,
}

View File

@ -5,7 +5,7 @@ use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
use crate::Result;
/// Find all the document ids for which the given field contains a value contained within
@ -25,11 +25,11 @@ where
let inner;
let left = match left {
Bound::Included(left) => {
inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?;
Bound::Included(inner.as_ref())
}
Bound::Excluded(left) => {
inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?;
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
@ -37,22 +37,25 @@ where
let inner;
let right = match right {
Bound::Included(right) => {
inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?;
Bound::Included(inner.as_ref())
}
Bound::Excluded(right) => {
inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?;
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
};
let db = db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let db = db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids };
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(starting_left_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
let rightmost_bound =
Bound::Included(get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
if let Some(starting_left_bound) =
get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?
{
let rightmost_bound = Bound::Included(
get_last_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?.unwrap(),
); // will not fail because get_first_facet_value succeeded
let group_size = usize::MAX;
f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
Ok(())
@ -64,7 +67,7 @@ where
/// Fetch the document ids that have a facet with a value between the two given bounds
struct FacetRangeSearch<'t, 'b, 'bitmap> {
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
left: Bound<&'b [u8]>,
right: Bound<&'b [u8]>,

View File

@ -5,7 +5,7 @@ use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
/// Return an iterator which iterates over the given candidate documents in
/// ascending order of their facet value for the given field id.
@ -31,12 +31,12 @@ use crate::heed_codec::BytesRefCodec;
/// Note that once a document id is returned by the iterator, it is never returned again.
pub fn ascending_facet_sort<'t>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
@ -53,12 +53,14 @@ pub fn ascending_facet_sort<'t>(
struct AscendingFacetSort<'t, 'e> {
rtxn: &'t heed::RoTxn<'e>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
#[allow(clippy::type_complexity)]
stack: Vec<(
RoaringBitmap,
std::iter::Take<heed::RoRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>>,
std::iter::Take<
heed::RoRange<'t, FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
>,
)>,
}

View File

@ -7,21 +7,21 @@ use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
/// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort).
///
/// This function does the same thing, but in the opposite order.
pub fn descending_facet_sort<'t>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let last_bound = get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap();
let last_bound = get_last_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?.unwrap();
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
Ok(itertools::Either::Left(DescendingFacetSort {
@ -37,13 +37,13 @@ pub fn descending_facet_sort<'t>(
struct DescendingFacetSort<'t> {
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
#[allow(clippy::type_complexity)]
stack: Vec<(
RoaringBitmap,
std::iter::Take<
heed::RoRevRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
heed::RoRevRange<'t, FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
>,
Bound<&'t [u8]>,
)>,
@ -100,7 +100,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> {
*right_bound = Bound::Excluded(left_bound);
let iter = match self
.db
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
.rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow))
{
Ok(iter) => iter,
@ -123,7 +123,7 @@ mod tests {
use roaring::RoaringBitmap;
use crate::heed_codec::facet::FacetGroupKeyCodec;
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
use crate::milli_snap;
use crate::search::facet::facet_sort_descending::descending_facet_sort;
use crate::search::facet::tests::{
@ -144,7 +144,7 @@ mod tests {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
@ -167,7 +167,7 @@ mod tests {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();

View File

@ -1,13 +1,13 @@
pub use facet_sort_ascending::ascending_facet_sort;
pub use facet_sort_descending::descending_facet_sort;
use heed::types::{Bytes, DecodeIgnore};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result};
mod facet_distribution;
mod facet_distribution_iter;
@ -22,10 +22,8 @@ fn facet_extreme_value<'t>(
let extreme_value =
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
let (_, extreme_value) = extreme_value?;
OrderedF64Codec::bytes_decode(extreme_value)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into)
Ok(OrderedF64Codec::bytes_decode(extreme_value))
}
pub fn facet_min_value<'t>(
@ -34,7 +32,7 @@ pub fn facet_min_value<'t>(
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
@ -45,7 +43,7 @@ pub fn facet_max_value<'t>(
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
@ -53,7 +51,7 @@ pub fn facet_max_value<'t>(
/// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec>(
txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
@ -62,12 +60,13 @@ where
let mut level0prefix = vec![];
level0prefix.extend_from_slice(&field_id.to_be_bytes());
level0prefix.push(0);
let mut level0_iter_forward =
db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, level0prefix.as_slice())?;
let mut level0_iter_forward = db
.as_polymorph()
.prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?;
if let Some(first) = level0_iter_forward.next() {
let (first_key, _) = first?;
let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key)
.map_err(heed::Error::Decoding)?;
.ok_or(heed::Error::Encoding)?;
Ok(Some(first_key.left_bound))
} else {
Ok(None)
@ -77,7 +76,7 @@ where
/// Get the last facet value in the facet database
pub(crate) fn get_last_facet_value<'t, BoundCodec>(
txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
@ -86,12 +85,13 @@ where
let mut level0prefix = vec![];
level0prefix.extend_from_slice(&field_id.to_be_bytes());
level0prefix.push(0);
let mut level0_iter_backward =
db.remap_types::<Bytes, DecodeIgnore>().rev_prefix_iter(txn, level0prefix.as_slice())?;
let mut level0_iter_backward = db
.as_polymorph()
.rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?;
if let Some(last) = level0_iter_backward.next() {
let (last_key, _) = last?;
let last_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key)
.map_err(heed::Error::Decoding)?;
.ok_or(heed::Error::Encoding)?;
Ok(Some(last_key.left_bound))
} else {
Ok(None)
@ -101,17 +101,17 @@ where
/// Get the height of the highest level in the facet database
pub(crate) fn get_highest_level<'t>(
txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
) -> heed::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes();
Ok(db
.remap_types::<Bytes, DecodeIgnore>()
.rev_prefix_iter(txn, field_id_prefix)?
.as_polymorph()
.rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, field_id_prefix)?
.next()
.map(|el| {
let (key, _) = el.unwrap();
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key).unwrap();
key.level
})
.unwrap_or(0))

View File

@ -1,183 +0,0 @@
use std::cmp::Ordering;
use itertools::Itertools;
use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::{MatchingWords, Result, Search, SearchResult};
struct ScoreWithRatioResult {
matching_words: MatchingWords,
candidates: RoaringBitmap,
document_scores: Vec<(u32, ScoreWithRatio)>,
}
type ScoreWithRatio = (Vec<ScoreDetails>, f32);
fn compare_scores(
&(ref left_scores, left_ratio): &ScoreWithRatio,
&(ref right_scores, right_ratio): &ScoreWithRatio,
) -> Ordering {
let mut left_it = ScoreDetails::score_values(left_scores.iter());
let mut right_it = ScoreDetails::score_values(right_scores.iter());
loop {
let left = left_it.next();
let right = right_it.next();
match (left, right) {
(None, None) => return Ordering::Equal,
(None, Some(_)) => return Ordering::Less,
(Some(_), None) => return Ordering::Greater,
(Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => {
let left = left * left_ratio as f64;
let right = right * right_ratio as f64;
if (left - right).abs() <= f64::EPSILON {
continue;
}
return left.partial_cmp(&right).unwrap();
}
(Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => {
match left.partial_cmp(right).unwrap() {
Ordering::Equal => continue,
order => return order,
}
}
(Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => {
match left.partial_cmp(right).unwrap() {
Ordering::Equal => continue,
order => return order,
}
}
(Some(ScoreValue::Score(_)), Some(_)) => return Ordering::Greater,
(Some(_), Some(ScoreValue::Score(_))) => return Ordering::Less,
// if we have this, we're bad
(Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_)))
| (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => {
unreachable!("Unexpected geo and sort comparison")
}
}
}
}
impl ScoreWithRatioResult {
fn new(results: SearchResult, ratio: f32) -> Self {
let document_scores = results
.documents_ids
.into_iter()
.zip(results.document_scores.into_iter().map(|scores| (scores, ratio)))
.collect();
Self {
matching_words: results.matching_words,
candidates: results.candidates,
document_scores,
}
}
fn merge(left: Self, right: Self, from: usize, length: usize) -> SearchResult {
let mut documents_ids =
Vec::with_capacity(left.document_scores.len() + right.document_scores.len());
let mut document_scores =
Vec::with_capacity(left.document_scores.len() + right.document_scores.len());
let mut documents_seen = RoaringBitmap::new();
for (docid, (main_score, _sub_score)) in left
.document_scores
.into_iter()
.merge_by(right.document_scores.into_iter(), |(_, left), (_, right)| {
// the first value is the one with the greatest score
compare_scores(left, right).is_ge()
})
// remove documents we already saw
.filter(|(docid, _)| documents_seen.insert(*docid))
// start skipping **after** the filter
.skip(from)
// take **after** skipping
.take(length)
{
documents_ids.push(docid);
// TODO: pass both scores to documents_score in some way?
document_scores.push(main_score);
}
SearchResult {
matching_words: left.matching_words,
candidates: left.candidates | right.candidates,
documents_ids,
document_scores,
}
}
}
impl<'a> Search<'a> {
pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<SearchResult> {
// TODO: find classier way to achieve that than to reset vector and query params
// create separate keyword and semantic searches
let mut search = Search {
query: self.query.clone(),
vector: self.vector.clone(),
filter: self.filter.clone(),
offset: 0,
limit: self.limit + self.offset,
sort_criteria: self.sort_criteria.clone(),
searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy,
terms_matching_strategy: self.terms_matching_strategy,
scoring_strategy: ScoringStrategy::Detailed,
words_limit: self.words_limit,
exhaustive_number_hits: self.exhaustive_number_hits,
rtxn: self.rtxn,
index: self.index,
distribution_shift: self.distribution_shift,
embedder_name: self.embedder_name.clone(),
};
let vector_query = search.vector.take();
let keyword_results = search.execute()?;
// skip semantic search if we don't have a vector query (placeholder search)
let Some(vector_query) = vector_query else {
return Ok(keyword_results);
};
// completely skip semantic search if the results of the keyword search are good enough
if self.results_good_enough(&keyword_results, semantic_ratio) {
return Ok(keyword_results);
}
search.vector = Some(vector_query);
search.query = None;
// TODO: would be better to have two distinct functions at this point
let vector_results = search.execute()?;
let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio);
let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);
let merge_results =
ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
assert!(merge_results.documents_ids.len() <= self.limit);
Ok(merge_results)
}
fn results_good_enough(&self, keyword_results: &SearchResult, semantic_ratio: f32) -> bool {
// A result is good enough if its keyword score is > 0.9 with a semantic ratio of 0.5 => 0.9 * 0.5
const GOOD_ENOUGH_SCORE: f64 = 0.45;
// 1. we check that we got a sufficient number of results
if keyword_results.document_scores.len() < self.limit + self.offset {
return false;
}
// 2. and that all results have a good enough score.
// we need to check all results because due to sort like rules, they're not necessarily in relevancy order
for score in &keyword_results.document_scores {
let score = ScoreDetails::global_score(score.iter());
if score * ((1.0 - semantic_ratio) as f64) < GOOD_ENOUGH_SCORE {
return false;
}
}
true
}
}

View File

@ -12,14 +12,13 @@ use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
use self::new::{execute_vector_search, PartialSearchResult};
use self::new::PartialSearchResult;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::DistributionShift;
use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index,
Result, SearchContext,
execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result,
SearchContext, BEU16,
};
// Building these factories is not free.
@ -27,12 +26,11 @@ static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
/// The maximum number of values per facet returned by the facet search route.
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;
/// The maximum number of facets returned by the facet search route.
const MAX_NUMBER_OF_FACETS: usize = 100;
pub mod facet;
mod fst_utils;
pub mod hybrid;
pub mod new;
pub struct Search<'a> {
@ -49,11 +47,8 @@ pub struct Search<'a> {
scoring_strategy: ScoringStrategy,
words_limit: usize,
exhaustive_number_hits: bool,
/// TODO: Add semantic ratio or pass it directly to execute_hybrid()
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
distribution_shift: Option<DistributionShift>,
embedder_name: Option<String>,
}
impl<'a> Search<'a> {
@ -73,8 +68,6 @@ impl<'a> Search<'a> {
words_limit: 10,
rtxn,
index,
distribution_shift: None,
embedder_name: None,
}
}
@ -83,8 +76,8 @@ impl<'a> Search<'a> {
self
}
pub fn vector(&mut self, vector: Vec<f32>) -> &mut Search<'a> {
self.vector = Some(vector);
pub fn vector(&mut self, vector: impl Into<Vec<f32>>) -> &mut Search<'a> {
self.vector = Some(vector.into());
self
}
@ -141,75 +134,30 @@ impl<'a> Search<'a> {
self
}
pub fn distribution_shift(
&mut self,
distribution_shift: Option<DistributionShift>,
) -> &mut Search<'a> {
self.distribution_shift = distribution_shift;
self
}
pub fn embedder_name(&mut self, embedder_name: impl Into<String>) -> &mut Search<'a> {
self.embedder_name = Some(embedder_name.into());
self
}
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
if has_vector_search {
let ctx = SearchContext::new(self.index, self.rtxn);
filtered_universe(&ctx, &self.filter)
} else {
Ok(self.execute()?.candidates)
}
}
pub fn execute(&self) -> Result<SearchResult> {
let embedder_name;
let embedder_name = match &self.embedder_name {
Some(embedder_name) => embedder_name,
None => {
embedder_name = self.index.default_embedding_name(self.rtxn)?;
&embedder_name
}
};
let mut ctx = SearchContext::new(self.index, self.rtxn);
if let Some(searchable_attributes) = self.searchable_attributes {
ctx.searchable_attributes(searchable_attributes)?;
}
let universe = filtered_universe(&ctx, &self.filter)?;
let PartialSearchResult { located_query_terms, candidates, documents_ids, document_scores } =
match self.vector.as_ref() {
Some(vector) => execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
universe,
&self.sort_criteria,
self.geo_strategy,
self.offset,
self.limit,
self.distribution_shift,
embedder_name,
)?,
None => execute_search(
&mut ctx,
self.query.as_deref(),
self.terms_matching_strategy,
self.scoring_strategy,
self.exhaustive_number_hits,
universe,
&self.sort_criteria,
self.geo_strategy,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
)?,
};
execute_search(
&mut ctx,
&self.query,
&self.vector,
self.terms_matching_strategy,
self.scoring_strategy,
self.exhaustive_number_hits,
&self.filter,
&self.sort_criteria,
self.geo_strategy,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
)?;
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
@ -238,8 +186,6 @@ impl fmt::Debug for Search<'_> {
exhaustive_number_hits,
rtxn: _,
index: _,
distribution_shift,
embedder_name,
} = self;
f.debug_struct("Search")
.field("query", query)
@ -253,8 +199,6 @@ impl fmt::Debug for Search<'_> {
.field("scoring_strategy", scoring_strategy)
.field("exhaustive_number_hits", exhaustive_number_hits)
.field("words_limit", words_limit)
.field("distribution_shift", distribution_shift)
.field("embedder_name", embedder_name)
.finish()
}
}
@ -306,23 +250,11 @@ pub struct SearchForFacetValues<'a> {
query: Option<String>,
facet: String,
search_query: Search<'a>,
max_values: usize,
is_hybrid: bool,
}
impl<'a> SearchForFacetValues<'a> {
pub fn new(
facet: String,
search_query: Search<'a>,
is_hybrid: bool,
) -> SearchForFacetValues<'a> {
SearchForFacetValues {
query: None,
facet,
search_query,
max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
is_hybrid,
}
pub fn new(facet: String, search_query: Search<'a>) -> SearchForFacetValues<'a> {
SearchForFacetValues { query: None, facet, search_query }
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
@ -330,11 +262,6 @@ impl<'a> SearchForFacetValues<'a> {
self
}
pub fn max_values(&mut self, max: usize) -> &mut Self {
self.max_values = max;
self
}
fn one_original_value_of(
&self,
field_id: FieldId,
@ -372,14 +299,12 @@ impl<'a> SearchForFacetValues<'a> {
None => return Ok(Vec::new()),
};
let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? {
let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &BEU16::new(fid))? {
Some(fst) => fst,
None => return Ok(vec![]),
};
let search_candidates = self
.search_query
.execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?;
let search_candidates = self.search_query.execute()?.candidates;
match self.query.as_ref() {
Some(query) => {
@ -474,7 +399,7 @@ impl<'a> SearchForFacetValues<'a> {
.unwrap_or_else(|| left_bound.to_string());
results.push(FacetValueHit { value, count });
}
if results.len() >= self.max_values {
if results.len() >= MAX_NUMBER_OF_FACETS {
break;
}
}
@ -519,7 +444,7 @@ impl<'a> SearchForFacetValues<'a> {
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
}
if results.len() >= self.max_values {
if results.len() >= MAX_NUMBER_OF_FACETS {
return Ok(ControlFlow::Break(()));
}
}

View File

@ -15,7 +15,6 @@ pub struct BucketSortOutput {
// TODO: would probably be good to regroup some of these inside of a struct?
#[allow(clippy::too_many_arguments)]
#[logging_timer::time]
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,

View File

@ -3,14 +3,13 @@ use std::collections::hash_map::Entry;
use std::hash::Hash;
use fxhash::FxHashMap;
use heed::types::Bytes;
use heed::types::ByteSlice;
use heed::{BytesEncode, Database, RoTxn};
use roaring::RoaringBitmap;
use super::interner::Interned;
use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::proximity::ProximityPrecision;
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
@ -51,7 +50,7 @@ impl<'ctx> DatabaseCache<'ctx> {
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
db: Database<KC, ByteSlice>,
) -> Result<Option<DC::DItem>>
where
K1: Copy + Eq + Hash,
@ -64,14 +63,12 @@ impl<'ctx> DatabaseCache<'ctx> {
}
match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Borrowed(bytes)) => {
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
}
Some(Cow::Owned(bytes)) => {
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
}
None => Ok(None),
}
}
@ -81,7 +78,7 @@ impl<'ctx> DatabaseCache<'ctx> {
cache_key: K1,
db_keys: &'v [KC::EItem],
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
db: Database<KC, ByteSlice>,
merger: MergeFn,
) -> Result<Option<DC::DItem>>
where
@ -113,14 +110,12 @@ impl<'ctx> DatabaseCache<'ctx> {
}
match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Borrowed(bytes)) => {
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
}
Some(Cow::Owned(bytes)) => {
DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some)
}
None => Ok(None),
}
}
@ -162,15 +157,14 @@ impl<'ctx> SearchContext<'ctx> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
merge_cbo_roaring_bitmaps,
)
}
@ -179,7 +173,7 @@ impl<'ctx> SearchContext<'ctx> {
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
self.index.word_docids.remap_data_type::<Bytes>(),
self.index.word_docids.remap_data_type::<ByteSlice>(),
),
}
}
@ -188,29 +182,13 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids,
self.index.exact_word_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids,
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
@ -241,15 +219,14 @@ impl<'ctx> SearchContext<'ctx> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
merge_cbo_roaring_bitmaps,
)
}
@ -258,7 +235,7 @@ impl<'ctx> SearchContext<'ctx> {
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_docids.remap_data_type::<Bytes>(),
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
),
}
}
@ -267,29 +244,13 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids,
self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids,
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_db_word_pair_proximity_docids(
@ -298,65 +259,17 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
ProximityPrecision::ByAttribute => {
// Force proximity to 0 because:
// in ByAttribute, there are only 2 possible distances:
// 1. words in same attribute: in that the DB contains (0, word1, word2)
// 2. words in different attributes: no DB entry for these two words.
let proximity = 0;
let docids = if let Some(docids) =
self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2))
{
docids
.as_ref()
.map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
.transpose()
.map_err(heed::Error::Decoding)?
} else {
// Compute the distance at the attribute level and store it in the cache.
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
fids
} else {
self.index.fields_ids_map(self.txn)?.ids().collect()
};
let mut docids = RoaringBitmap::new();
for fid in fids {
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
let word2_docids = self.get_db_word_fid_docids(word2, fid)?;
if let (Some(word1_docids), Some(word2_docids)) =
(word1_docids, word2_docids)
{
docids |= word1_docids & word2_docids;
}
}
let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
.map(Cow::into_owned)
.map(Cow::Owned)
.map(Some)
.map_err(heed::Error::Decoding)?;
self.db_cache
.word_pair_proximity_docids
.insert((proximity, word1, word2), encoded);
Some(docids)
};
Ok(docids)
}
ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
}
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_db_word_pair_proximity_docids_len(
@ -365,95 +278,54 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<u64>> {
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
ProximityPrecision::ByAttribute => Ok(self
.get_db_word_pair_proximity_docids(word1, word2, proximity)?
.map(|d| d.len())),
ProximityPrecision::ByWord => {
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
)
}
}
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_db_word_prefix_pair_proximity_docids(
&mut self,
word1: Interned<String>,
prefix2: Interned<String>,
mut proximity: u8,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default();
if proximity_precision == ProximityPrecision::ByAttribute {
// Force proximity to 0 because:
// in ByAttribute, there are only 2 possible distances:
// 1. words in same attribute: in that the DB contains (0, word1, word2)
// 2. words in different attributes: no DB entry for these two words.
proximity = 0;
}
let docids = if let Some(docids) =
self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2))
let docids = match self
.db_cache
.word_prefix_pair_proximity_docids
.entry((proximity, word1, prefix2))
{
docids.clone()
} else {
let prefix_docids = match proximity_precision {
ProximityPrecision::ByAttribute => {
// Compute the distance at the attribute level and store it in the cache.
let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
fids
} else {
self.index.fields_ids_map(self.txn)?.ids().collect()
};
let mut prefix_docids = RoaringBitmap::new();
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
for fid in fids {
let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?;
if let (Some(word1_docids), Some(prefix2_docids)) =
(word1_docids, prefix2_docids)
{
prefix_docids |= word1_docids & prefix2_docids;
}
}
prefix_docids
}
ProximityPrecision::ByWord => {
// compute docids using prefix iter and store the result in the cache.
let key = U8StrStrCodec::bytes_encode(&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
))
.unwrap()
.into_owned();
let mut prefix_docids = RoaringBitmap::new();
let remap_key_type = self
.index
.word_pair_proximity_docids
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?;
for result in remap_key_type {
let (_, docids) = result?;
Entry::Occupied(docids) => docids.get().clone(),
Entry::Vacant(entry) => {
// compute docids using prefix iter and store the result in the cache.
let key = U8StrStrCodec::bytes_encode(&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
))
.unwrap()
.into_owned();
let mut prefix_docids = RoaringBitmap::new();
let remap_key_type = self
.index
.word_pair_proximity_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.txn, &key)?;
for result in remap_key_type {
let (_, docids) = result?;
prefix_docids |= docids;
}
prefix_docids
prefix_docids |= docids;
}
};
self.db_cache
.word_prefix_pair_proximity_docids
.insert((proximity, word1, prefix2), Some(prefix_docids.clone()));
Some(prefix_docids)
entry.insert(Some(prefix_docids.clone()));
Some(prefix_docids)
}
};
Ok(docids)
}
@ -483,7 +355,7 @@ impl<'ctx> SearchContext<'ctx> {
(word, fid),
&(self.word_interner.get(word).as_str(), fid),
&mut self.db_cache.word_fid_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
)
}
@ -502,7 +374,7 @@ impl<'ctx> SearchContext<'ctx> {
(word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
)
}
@ -516,7 +388,7 @@ impl<'ctx> SearchContext<'ctx> {
let remap_key_type = self
.index
.word_fid_docids
.remap_types::<Bytes, Bytes>()
.remap_types::<ByteSlice, ByteSlice>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
@ -542,7 +414,7 @@ impl<'ctx> SearchContext<'ctx> {
let remap_key_type = self
.index
.word_prefix_fid_docids
.remap_types::<Bytes, Bytes>()
.remap_types::<ByteSlice, ByteSlice>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
@ -570,7 +442,7 @@ impl<'ctx> SearchContext<'ctx> {
(word, position),
&(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids,
self.index.word_position_docids.remap_data_type::<Bytes>(),
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
)
}
@ -584,7 +456,7 @@ impl<'ctx> SearchContext<'ctx> {
(word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
self.index.word_prefix_position_docids.remap_data_type::<ByteSlice>(),
)
}
@ -598,7 +470,7 @@ impl<'ctx> SearchContext<'ctx> {
let remap_key_type = self
.index
.word_position_docids
.remap_types::<Bytes, Bytes>()
.remap_types::<ByteSlice, ByteSlice>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
@ -629,7 +501,7 @@ impl<'ctx> SearchContext<'ctx> {
let remap_key_type = self
.index
.word_prefix_position_docids
.remap_types::<Bytes, Bytes>()
.remap_types::<ByteSlice, ByteSlice>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {

View File

@ -1,4 +1,4 @@
use heed::types::{Bytes, Str, Unit};
use heed::types::{ByteSlice, Str, Unit};
use heed::{Database, RoPrefix, RoTxn};
use roaring::RoaringBitmap;
@ -8,7 +8,7 @@ const DOCID_SIZE: usize = 4;
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result, SearchContext};
pub struct DistinctOutput {
@ -71,7 +71,7 @@ pub fn distinct_single_docid(
/// Return all the docids containing the given value in the given field
fn facet_value_docids(
database: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
database: Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
txn: &RoTxn,
field_id: u16,
facet_value: &[u8],
@ -87,12 +87,12 @@ fn facet_number_values<'a>(
field_id: u16,
index: &Index,
txn: &'a RoTxn,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Unit>> {
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Unit>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<Bytes>()
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type();
@ -105,12 +105,12 @@ pub fn facet_string_values<'a>(
field_id: u16,
index: &Index,
txn: &'a RoTxn,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Str>> {
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Str>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<Bytes>()
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_types();

View File

@ -1,7 +1,7 @@
use std::collections::VecDeque;
use std::iter::FromIterator;
use heed::types::{Bytes, Unit};
use heed::types::{ByteSlice, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
@ -34,7 +34,7 @@ fn facet_number_values<'a>(
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<Bytes>()
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type();
@ -107,16 +107,12 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
/// Refill the internal buffer of cached docids based on the strategy.
/// Drop the rtree if we don't need it anymore.
fn fill_buffer(
&mut self,
ctx: &mut SearchContext,
geo_candidates: &RoaringBitmap,
) -> Result<()> {
fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
debug_assert!(self.cached_sorted_docids.is_empty());
// lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree`
let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) {
let rtree = if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
if let Some(rtree) = self.rtree.as_ref() {
// get rtree from cache
Some(rtree)
@ -135,7 +131,7 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
if self.ascending {
let point = lat_lng_to_xyz(&self.point);
for point in rtree.nearest_neighbor_iter(&point) {
if geo_candidates.contains(point.data.0) {
if self.geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.push_back(point.data);
if self.cached_sorted_docids.len() >= cache_size {
break;
@ -147,7 +143,7 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
// and we insert the points in reverse order they get reversed when emptying the cache later on
let point = lat_lng_to_xyz(&opposite_of(self.point));
for point in rtree.nearest_neighbor_iter(&point) {
if geo_candidates.contains(point.data.0) {
if self.geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.push_front(point.data);
if self.cached_sorted_docids.len() >= cache_size {
break;
@ -159,14 +155,15 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
// the iterative version
let [lat, lng] = self.field_ids.unwrap();
let mut documents = geo_candidates
let mut documents = self
.geo_candidates
.iter()
.map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) })
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
// computing the distance between two points is expensive thus we cache the result
documents
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
self.cached_sorted_docids.extend(documents);
self.cached_sorted_docids.extend(documents.into_iter());
};
Ok(())
@ -219,10 +216,9 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
assert!(self.query.is_none());
self.query = Some(query.clone());
self.geo_candidates &= universe;
let geo_candidates = &self.geo_candidates & universe;
if geo_candidates.is_empty() {
if self.geo_candidates.is_empty() {
return Ok(());
}
@ -230,7 +226,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
self.field_ids = Some([lat, lng]);
self.fill_buffer(ctx, &geo_candidates)?;
self.fill_buffer(ctx)?;
Ok(())
}
@ -242,10 +238,9 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Q>>> {
let query = self.query.as_ref().unwrap().clone();
self.geo_candidates &= universe;
let geo_candidates = &self.geo_candidates & universe;
if geo_candidates.is_empty() {
if self.geo_candidates.is_empty() {
return Ok(Some(RankingRuleOutput {
query,
candidates: universe.clone(),
@ -266,7 +261,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
}
};
while let Some((id, point)) = next(&mut self.cached_sorted_docids) {
if geo_candidates.contains(id) {
if self.geo_candidates.contains(id) {
return Ok(Some(RankingRuleOutput {
query,
candidates: RoaringBitmap::from_iter([id]),
@ -281,7 +276,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
// if we got out of this loop it means we've exhausted our cache.
// we need to refill it and run the function again.
self.fill_buffer(ctx, &geo_candidates)?;
self.fill_buffer(ctx)?;
self.next_bucket(ctx, logger, universe)
}

Some files were not shown because too many files have changed in this diff Show More