Squash in a single commit and rebase

This commit is contained in:
Clément Renault
2024-07-01 16:38:52 +02:00
committed by Kerollmops
parent 0776217801
commit beef5b5f98
23 changed files with 729 additions and 388 deletions

Cargo.lock generated

File diff suppressed because it is too large

View File

@ -150,7 +150,7 @@ fn main() {
// after executing a batch we check if the database is corrupted
let res = index.search(&wtxn).execute().unwrap();
index.documents(&wtxn, res.documents_ids).unwrap();
index.compressed_documents(&wtxn, res.documents_ids).unwrap();
progression.fetch_add(1, Ordering::Relaxed);
}
wtxn.abort();

View File

@ -15,7 +15,7 @@ pub mod star_or;
pub mod task_view;
pub mod tasks;
pub mod versioning;
pub use milli::{heed, Index};
pub use milli::{heed, zstd, Index};
use uuid::Uuid;
pub use versioning::VERSION_FILE_NAME;
pub use {milli, serde_cs};

View File

@ -132,7 +132,7 @@ reqwest = { version = "0.12.12", features = [
sha-1 = { version = "0.10.1", optional = true }
static-files = { version = "0.2.4", optional = true }
tempfile = { version = "3.15.0", optional = true }
zip = { version = "2.2.2", optional = true }
zip = { version = "2.2.2", default-features = false, features = ["deflate"], optional = true }
[features]
default = ["meilisearch-types/all-tokenizations", "mini-dashboard"]

View File

@ -1411,43 +1411,50 @@ fn some_documents<'a, 't: 'a>(
retrieve_vectors: RetrieveVectors,
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let dictionary = index.document_decompression_dictionary(rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(rtxn)?;
let mut buffer = Vec::new();
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
// Clippy is simply wrong
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
);
Ok(index.iter_compressed_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(
|(key, compressed_document)| -> Result<_, ResponseError> {
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
// Clippy is simply wrong
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings)
.map_err(MeilisearchHttpError::from)?,
);
}
document.insert("_vectors".into(), vectors.into());
}
document.insert("_vectors".into(), vectors.into());
}
}
Ok(document)
})
Ok(document)
},
)
}))
}

View File

@ -1330,6 +1330,20 @@ impl<'a> HitMaker<'a> {
let (_, obkv) =
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
// let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
// formatter_builder.crop_marker(format.crop_marker);
// formatter_builder.highlight_prefix(format.highlight_pre_tag);
// formatter_builder.highlight_suffix(format.highlight_post_tag);
// let decompression_dictionary = index.document_decompression_dictionary(rtxn)?;
// let mut buffer = Vec::new();
// let mut documents = Vec::new();
// let embedding_configs = index.embedding_configs(rtxn)?;
// let documents_iter = index.compressed_documents(rtxn, documents_ids)?;
// for ((id, compressed), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
// let obkv = compressed
// .decompress_with_optional_dictionary(&mut buffer, decompression_dictionary.as_ref())
// // TODO use a better error?
// .map_err(|e| MeilisearchHttpError::HeedError(e.into()))?;
// First generate a document with all the displayed fields
let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;

View File

@ -280,6 +280,7 @@ fn export_a_dump(
// 4. Dump the indexes
let mut count = 0;
let mut buffer = Vec::new();
for result in index_mapping.iter(&rtxn)? {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
@ -288,6 +289,7 @@ fn export_a_dump(
})?;
let rtxn = index.read_txn()?;
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
@ -300,8 +302,11 @@ fn export_a_dump(
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
// 4.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
let (_id, doc) = ret?;
for ret in index.all_compressed_documents(&rtxn)? {
let (_id, compressed_doc) = ret?;
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}

View File

@ -36,6 +36,7 @@ heed = { version = "0.20.5", default-features = false, features = [
indexmap = { version = "2.7.0", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
zstd = { version = "0.13.1", features = ["zdict_builder", "experimental"] }
memchr = "2.7.4"
memmap2 = "0.9.5"
obkv = "0.3.0"

View File

@ -0,0 +1,98 @@
use std::borrow::Cow;
use std::io;
use std::io::ErrorKind;
use heed::BoxedError;
use obkv::KvReaderU16;
use zstd::bulk::{Compressor, Decompressor};
use zstd::dict::{DecoderDictionary, EncoderDictionary};
pub struct CompressedObkvCodec;
impl<'a> heed::BytesDecode<'a> for CompressedObkvCodec {
type DItem = CompressedKvReaderU16<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Ok(CompressedKvReaderU16(bytes))
}
}
impl heed::BytesEncode<'_> for CompressedObkvCodec {
type EItem = CompressedKvWriterU16;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
Ok(Cow::Borrowed(&item.0))
}
}
// TODO Make this an unsized slice wrapper instead?
// &'a CompressedKvReaderU16([u8])
pub struct CompressedKvReaderU16<'a>(&'a [u8]);
impl<'a> CompressedKvReaderU16<'a> {
/// Decompresses the KvReader into the buffer using the provided dictionary.
pub fn decompress_with<'b>(
&self,
buffer: &'b mut Vec<u8>,
dictionary: &DecoderDictionary,
) -> io::Result<&'b KvReaderU16> {
const TWO_GIGABYTES: usize = 2 * 1024 * 1024 * 1024;
let mut decompressor = Decompressor::with_prepared_dictionary(dictionary)?;
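// The decompressed size is unknown up front: start from a 4x guess and grow
// the buffer geometrically (up to 2 GiB) until the decompressed content fits.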
let mut max_size = self.0.len() * 4;
let size = loop {
buffer.resize(max_size, 0);
match decompressor.decompress_to_buffer(self.0, &mut buffer[..max_size]) {
Ok(size) => break size,
// TODO don't do that !!! But what should I do?
Err(e) if e.kind() == ErrorKind::Other && max_size <= TWO_GIGABYTES => {
max_size *= 2
}
Err(e) => return Err(e),
}
};
Ok(KvReaderU16::from_slice(&buffer[..size]))
}
/// Returns the KvReader as if it were not compressed.
/// This happens when there is no dictionary yet.
pub fn as_non_compressed(&self) -> &'a KvReaderU16 {
KvReaderU16::from_slice(self.0)
}
/// Decompresses this KvReader if necessary.
pub fn decompress_with_optional_dictionary<'b>(
&self,
buffer: &'b mut Vec<u8>,
dictionary: Option<&DecoderDictionary>,
) -> io::Result<&'b KvReaderU16>
where
'a: 'b,
{
match dictionary {
Some(dict) => self.decompress_with(buffer, dict),
None => Ok(self.as_non_compressed()),
}
}
pub fn decompress_as_owned_with_optional_dictionary(
&self,
dictionary: Option<&DecoderDictionary>,
) -> io::Result<Cow<'a, KvReaderU16>> {
todo!("Impl owned version of KvReader")
}
}
pub struct CompressedKvWriterU16(Vec<u8>);
impl CompressedKvWriterU16 {
// TODO ask for a KvReaderU16 here
pub fn new_with_dictionary(input: &[u8], dictionary: &EncoderDictionary) -> io::Result<Self> {
let mut compressor = Compressor::with_prepared_dictionary(dictionary)?;
compressor.compress(input).map(CompressedKvWriterU16)
}
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
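For illustration, here is a minimal round trip through this codec. This sketch is not part of the commit and assumes `dict_bytes` was produced by `zstd::dict::from_continuous`:

use heed::BytesDecode;
use zstd::dict::{DecoderDictionary, EncoderDictionary};

fn round_trip(dict_bytes: &[u8], raw_obkv: &[u8]) -> std::io::Result<()> {
    const COMPRESSION_LEVEL: i32 = 19;
    let encoder_dict = EncoderDictionary::copy(dict_bytes, COMPRESSION_LEVEL);
    let decoder_dict = DecoderDictionary::copy(dict_bytes);

    // Compress the raw obkv bytes with the shared dictionary.
    let compressed = CompressedKvWriterU16::new_with_dictionary(raw_obkv, &encoder_dict)?;

    // Decode them back through the codec, as heed would when reading the database.
    let reader =
        CompressedObkvCodec::bytes_decode(compressed.as_bytes()).expect("decoding is infallible");
    let mut buffer = Vec::new();
    let document = reader.decompress_with(&mut buffer, &decoder_dict)?;
    assert_eq!(document.as_bytes(), raw_obkv);
    Ok(())
}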

View File

@ -1,6 +1,7 @@
mod beu16_str_codec;
mod beu32_str_codec;
mod byte_slice_ref;
mod compressed_obkv_codec;
pub mod facet;
mod field_id_word_count_codec;
mod fst_set_codec;
@ -18,6 +19,9 @@ use thiserror::Error;
pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::compressed_obkv_codec::{
CompressedKvReaderU16, CompressedKvWriterU16, CompressedObkvCodec,
};
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec;

View File

@ -9,6 +9,7 @@ use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use zstd::dict::{DecoderDictionary, EncoderDictionary};
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::documents::PrimaryKey;
@ -18,14 +19,17 @@ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec,
};
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
use crate::heed_codec::{
BEU16StrCodec, CompressedKvReaderU16, CompressedObkvCodec, FstSetCodec, StrBEU16Codec,
StrRefCodec,
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec,
FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, Result, RoaringBitmapCodec,
RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
};
@ -69,6 +73,7 @@ pub mod main_key {
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
pub const SEARCH_CUTOFF: &str = "search_cutoff";
pub const DOCUMENT_COMPRESSION_DICTIONARY: &str = "document-compression-dictionary";
pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules";
pub const FACET_SEARCH: &str = "facet_search";
pub const PREFIX_SEARCH: &str = "prefix_search";
@ -167,7 +172,7 @@ pub struct Index {
pub vector_arroy: arroy::Database<Unspecified>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<BEU32, ObkvCodec>,
pub(crate) documents: Database<BEU32, CompressedObkvCodec>,
}
impl Index {
@ -331,6 +336,50 @@ impl Index {
self.env.prepare_for_closing()
}
/* document compression dictionary */
/// Writes the dictionary that will later be used to compress the documents.
pub fn put_document_compression_dictionary(
&self,
wtxn: &mut RwTxn,
dictionary: &[u8],
) -> heed::Result<()> {
self.main.remap_types::<Str, Bytes>().put(
wtxn,
main_key::DOCUMENT_COMPRESSION_DICTIONARY,
dictionary,
)
}
/// Deletes the document compression dictionary.
pub fn delete_document_compression_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::DOCUMENT_COMPRESSION_DICTIONARY)
}
/// Returns the optional raw bytes dictionary to be used when reading or writing the OBKV documents.
pub fn document_compression_raw_dictionary<'t>(
&self,
rtxn: &'t RoTxn,
) -> heed::Result<Option<&'t [u8]>> {
self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::DOCUMENT_COMPRESSION_DICTIONARY)
}
pub fn document_decompression_dictionary<'t>(
&self,
rtxn: &'t RoTxn,
) -> heed::Result<Option<DecoderDictionary<'t>>> {
self.document_compression_raw_dictionary(rtxn).map(|opt| opt.map(DecoderDictionary::new))
}
pub fn document_compression_dictionary(
&self,
rtxn: &RoTxn,
) -> heed::Result<Option<EncoderDictionary<'static>>> {
const COMPRESSION_LEVEL: i32 = 19;
self.document_compression_raw_dictionary(rtxn)
.map(|opt| opt.map(|bytes| EncoderDictionary::copy(bytes, COMPRESSION_LEVEL)))
}
/* documents ids */
/// Writes the documents ids that correspond to the user-ids-documents-ids FST.
@ -1258,43 +1307,43 @@ impl Index {
/* documents */
/// Returns a document by using the document id.
pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result<&'t obkv::KvReaderU16> {
pub fn compressed_document<'t>(
&self,
rtxn: &'t RoTxn,
id: DocumentId,
) -> Result<CompressedKvReaderU16<'t>> {
self.documents
.get(rtxn, &id)?
.ok_or(UserError::UnknownInternalDocumentId { document_id: id })
.map_err(Into::into)
}
/// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
pub fn iter_documents<'a, 't: 'a>(
/// Returns an iterator over the requested compressed documents. The next item will be an error if a document is missing.
pub fn iter_compressed_documents<'a, 't: 'a>(
&'a self,
rtxn: &'t RoTxn<'t>,
ids: impl IntoIterator<Item = DocumentId> + 'a,
) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
Ok(ids.into_iter().map(move |id| {
let kv = self
.documents
.get(rtxn, &id)?
.ok_or(UserError::UnknownInternalDocumentId { document_id: id })?;
Ok((id, kv))
}))
) -> Result<impl Iterator<Item = Result<(DocumentId, CompressedKvReaderU16<'t>)>> + 'a> {
Ok(ids
.into_iter()
.map(move |id| self.compressed_document(rtxn, id).map(|compressed| (id, compressed))))
}
/// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
pub fn documents<'t>(
pub fn compressed_documents<'t>(
&self,
rtxn: &'t RoTxn<'t>,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, &'t obkv::KvReaderU16)>> {
self.iter_documents(rtxn, ids)?.collect()
) -> Result<Vec<(DocumentId, CompressedKvReaderU16<'t>)>> {
self.iter_compressed_documents(rtxn, ids)?.collect()
}
/// Returns an iterator over all the documents in the index.
pub fn all_documents<'a, 't: 'a>(
pub fn all_compressed_documents<'a, 't: 'a>(
&'a self,
rtxn: &'t RoTxn<'t>,
) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
self.iter_documents(rtxn, self.documents_ids(rtxn)?)
) -> Result<impl Iterator<Item = Result<(DocumentId, CompressedKvReaderU16<'t>)>> + 'a> {
self.iter_compressed_documents(rtxn, self.documents_ids(rtxn)?)
}
pub fn external_id_of<'a, 't: 'a>(
@ -1315,9 +1364,14 @@ impl Index {
process: "external_id_of",
})
})?;
Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
let (_docid, obkv) = entry?;
match primary_key.document_id(obkv, &fields)? {
let dictionary =
self.document_compression_raw_dictionary(rtxn)?.map(DecoderDictionary::copy);
let mut buffer = Vec::new();
Ok(self.iter_compressed_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
let (_docid, compressed_obkv) = entry?;
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
match primary_key.document_id(&obkv, &fields)? {
Ok(document_id) => Ok(document_id),
Err(_) => Err(InternalError::DocumentsError(
crate::documents::Error::InvalidDocumentFormat,
@ -2625,7 +2679,12 @@ pub(crate) mod tests {
"###);
let rtxn = index.read_txn().unwrap();
let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0];
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let (_docid, compressed_obkv) = index.compressed_documents(&rtxn, [0]).unwrap().remove(0);
let mut buffer = Vec::new();
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
@ -2634,7 +2693,10 @@ pub(crate) mod tests {
"###);
// Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34
let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0];
let (_docid, compressed_obkv) = index.compressed_documents(&rtxn, [2]).unwrap().remove(0);
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
@ -2643,6 +2705,7 @@ pub(crate) mod tests {
}
"###);
drop(dictionary);
drop(rtxn);
// Add new documents again
@ -2841,11 +2904,16 @@ pub(crate) mod tests {
} = search.execute().unwrap();
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
documents_ids.sort_unstable();
let docs = index.documents(&rtxn, documents_ids).unwrap();
let compressed_docs = index.compressed_documents(&rtxn, documents_ids).unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffer = Vec::new();
let mut all_ids = HashSet::new();
for (_docid, obkv) in docs {
let id = obkv.get(primary_key_id).unwrap();
assert!(all_ids.insert(id));
for (_docid, compressed) in compressed_docs {
let doc = compressed
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let id = doc.get(primary_key_id).unwrap();
assert!(all_ids.insert(id.to_vec()));
}
}
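Taken together, the new read path used throughout this commit can be sketched as a standalone helper (illustrative only, not part of the commit):

fn read_document(index: &milli::Index, docid: u32) -> milli::Result<serde_json::Value> {
    let rtxn = index.read_txn()?;
    // The dictionary is optional: documents written before it was trained are stored uncompressed.
    let dictionary = index.document_decompression_dictionary(&rtxn)?;
    let compressed = index.compressed_document(&rtxn, docid)?;
    let mut buffer = Vec::new();
    let document = compressed.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
    let fields_ids_map = index.fields_ids_map(&rtxn)?;
    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
    Ok(serde_json::Value::Object(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?))
}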

View File

@ -48,7 +48,7 @@ pub use search::new::{
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed, rhai};
pub use {charabia as tokenizer, heed, rhai, zstd};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};

View File

@ -25,8 +25,13 @@ fn collect_field_values(
) -> Vec<String> {
let mut values = vec![];
let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap();
for doc in index.documents(txn, docids.iter().copied()).unwrap() {
if let Some(v) = doc.1.get(fid) {
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(txn).unwrap();
for (_id, compressed_doc) in index.compressed_documents(txn, docids.iter().copied()).unwrap() {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
if let Some(v) = doc.get(fid) {
let v: serde_json::Value = serde_json::from_slice(v).unwrap();
let v = v.to_string();
values.push(v);

View File

@ -407,9 +407,15 @@ pub fn snap_documents(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let display = fields_ids_map.ids().collect::<Vec<_>>();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffer = Vec::new();
for document in index.all_documents(&rtxn).unwrap() {
let doc = obkv_to_json(&display, &fields_ids_map, document.unwrap().1).unwrap();
for result in index.all_compressed_documents(&rtxn).unwrap() {
let (_id, compressed_document) = result.unwrap();
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let doc = obkv_to_json(&display, &fields_ids_map, document).unwrap();
snap.push_str(&serde_json::to_string(&doc).unwrap());
snap.push('\n');
}

View File

@ -62,6 +62,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_document_compression_dictionary(self.wtxn)?;
// Remove all user-provided bits from the configs
let mut configs = self.index.embedding_configs(self.wtxn)?;

View File

@ -4,8 +4,8 @@ mod helpers;
mod transform;
mod typed_chunk;
use std::collections::HashSet;
use std::io::{Read, Seek};
use std::collections::{HashMap, HashSet};
use std::io::{BufWriter, Read, Seek, Write};
use std::iter;
use std::num::NonZeroU32;
use std::sync::Arc;
@ -13,9 +13,8 @@ use std::sync::Arc;
use crossbeam_channel::{Receiver, Sender};
use enrich::enrich_documents_batch;
use grenad::{Merger, MergerBuilder};
use hashbrown::HashMap;
use heed::types::Str;
use heed::Database;
use heed::types::{Bytes, Str};
use heed::{Database, PutFlags};
use rand::SeedableRng as _;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
@ -28,7 +27,8 @@ pub use self::helpers::*;
pub use self::transform::{Transform, TransformOutput};
use super::new::StdResult;
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError};
use crate::error::{Error, InternalError, UserError};
use crate::heed_codec::{CompressedKvWriterU16, CompressedObkvCodec};
use crate::index::{PrefixSearch, PrefixSettings};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
@ -36,7 +36,7 @@ use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::vector::{ArroyWrapper, EmbeddingConfigs};
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 4;
@ -201,7 +201,7 @@ where
target = "indexing::details",
name = "index_documents_raw"
)]
pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
pub fn execute_raw(mut self, output: TransformOutput) -> Result<u64>
where
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
@ -523,6 +523,10 @@ where
word_fid_docids.map(MergerBuilder::build),
)?;
// This call contains an internal condition so that we do not regenerate the
// compression dictionary, nor recompress the documents, on every run.
self.manage_compression_dictionary()?;
Ok(number_of_documents)
}
@ -533,7 +537,7 @@ where
name = "index_documents_prefix_databases"
)]
pub fn execute_prefix_databases(
self,
&mut self,
word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
@ -723,6 +727,64 @@ where
Ok(())
}
/// Computes a new dictionary and compresses the documents in the database with it.
///
/// Documents must still be compressed at write time whenever a dictionary already exists.
#[tracing::instrument(
level = "trace",
skip_all,
target = "indexing::compression",
name = "compress_documents_database"
)]
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
/// The size of the dictionary generated from a sample of the documents already
/// in the database. It will be used when compressing and decompressing documents.
const COMPRESSION_DICTIONARY_SIZE: usize = 64_000;
/// The minimum number of documents to trigger the generation of the compression dictionary.
const COMPRESSION_ON_NUMBER_OF_DOCUMENTS: usize = 10_000;
if self.index.number_of_documents(self.wtxn)? < COMPRESSION_ON_NUMBER_OF_DOCUMENTS as u64
|| self.index.document_compression_dictionary(self.wtxn)?.is_some()
{
return Ok(());
}
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
let mut sample_sizes = Vec::new();
// Sample the first COMPRESSION_ON_NUMBER_OF_DOCUMENTS documents already in the database.
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
for result in documents.iter(self.wtxn)?.take(COMPRESSION_ON_NUMBER_OF_DOCUMENTS) {
let (_id, bytes) = result?;
sample_file.write_all(bytes)?;
sample_sizes.push(bytes.len());
}
let sample_file = sample_file.into_inner().map_err(|ie| ie.into_error())?;
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
let dictionary =
zstd::dict::from_continuous(&sample_data, &sample_sizes, COMPRESSION_DICTIONARY_SIZE)?;
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
// safety: We just set the dictionary above. It must be there when we get it back.
let dictionary = self.index.document_compression_dictionary(self.wtxn)?.unwrap();
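// Every document stored so far predates the dictionary: rewrite them all, compressed, in place.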
let mut iter = self.index.documents.iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (docid, document) = result?;
let document = document.as_non_compressed().as_bytes();
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary)?;
// safety: the compressed document is entirely owned
unsafe {
iter.put_current_with_options::<CompressedObkvCodec>(
PutFlags::empty(),
&docid,
&compressed,
)?;
}
}
Ok(())
}
}
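Note that `zstd::dict::from_continuous` expects all training samples concatenated into a single continuous buffer along with their individual sizes, which is what the sample file built above provides. A minimal sketch of that contract, using hypothetical in-memory samples:

fn train_dictionary(samples: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
    const DICTIONARY_SIZE: usize = 64_000;
    let mut continuous = Vec::new();
    let mut sizes = Vec::new();
    for sample in samples {
        continuous.extend_from_slice(sample);
        sizes.push(sample.len());
    }
    // The trainer slices `continuous` back into individual samples using `sizes`.
    zstd::dict::from_continuous(&continuous, &sizes, DICTIONARY_SIZE)
}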
/// Run the word prefix docids update operation.
@ -814,7 +876,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
let count = index.all_documents(&rtxn).unwrap().count();
let count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(count, 3);
drop(rtxn);
@ -823,6 +885,7 @@ mod tests {
#[test]
fn simple_document_merge() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
// First we send 3 documents with duplicate ids and
@ -841,16 +904,21 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
let mut compressed_docs = index.compressed_documents(&rtxn, Some(0)).unwrap();
assert_eq!(compressed_docs.len(), 1);
let (id, compressed_doc) = compressed_docs.remove(0);
assert_eq!(id, 0);
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), None);
drop(dictionary);
drop(rtxn);
// Second we send 1 document with id 1, to force it to be merged with the previous one.
@ -862,10 +930,14 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
let mut compressed_docs = index.compressed_documents(&rtxn, Some(0)).unwrap();
assert_eq!(compressed_docs.len(), 1);
let (id, compressed_doc) = compressed_docs.remove(0);
assert_eq!(id, 0);
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
@ -873,6 +945,129 @@ mod tests {
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), Some((2, &b"25"[..])));
assert_eq!(doc_iter.next(), None);
drop(dictionary);
drop(rtxn);
}
#[test]
fn not_auto_generated_documents_ids() {
let index = TempIndex::new();
let result = index.add_documents(documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]));
assert!(result.is_err());
// Check that there is no document.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 0);
drop(rtxn);
}
#[test]
fn simple_auto_generated_documents_ids() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.autogenerate_docids = true;
// First we send 3 documents without ids; they will be autogenerated.
index
.add_documents(documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]))
.unwrap();
// Check that there are 3 documents now.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
let compressed_docs = index.compressed_documents(&rtxn, vec![0, 1, 2]).unwrap();
let (_id, compressed_obkv) = compressed_docs
.iter()
.find(|(_id, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
doc.get(0) == Some(br#""kevin""#)
})
.unwrap();
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let kevin_uuid: String = serde_json::from_slice(obkv.get(1).unwrap()).unwrap();
drop(dictionary);
drop(rtxn);
// Second we send 1 document with the generated uuid, to erase the previous ones.
index.add_documents(documents!([ { "name": "updated kevin", "id": kevin_uuid } ])).unwrap();
// Check that there are **always** 3 documents.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
// the document 0 has been deleted and reinserted with the id 3
let mut compressed_docs = index.compressed_documents(&rtxn, vec![1, 2, 0]).unwrap();
let kevin_position = compressed_docs
.iter()
.position(|(_, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
doc.get(0).unwrap() == br#""updated kevin""#
})
.unwrap();
assert_eq!(kevin_position, 2);
let (_, compressed_doc) = compressed_docs.remove(kevin_position);
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last
// one sent and that a UUID has been generated.
assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..]));
// This is a UUID, it must be 36 bytes long plus the 2 surrounding string quotes (").
assert_eq!(doc.get(1).unwrap().len(), 36 + 2);
drop(dictionary);
drop(rtxn);
}
#[test]
fn reordered_auto_generated_documents_ids() {
let mut index = TempIndex::new();
// First we send 3 documents with ids from 1 to 3.
index
.add_documents(documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "kevina" },
{ "id": 3, "name": "benoit" }
]))
.unwrap();
// Check that there are 3 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
drop(rtxn);
// Second we send 1 document without specifying the id.
index.index_documents_config.autogenerate_docids = true;
index.add_documents(documents!([ { "name": "new kevin" } ])).unwrap();
// Check that there are 4 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 4);
drop(rtxn);
}
@ -974,7 +1169,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 6);
let count = index.all_documents(&rtxn).unwrap().count();
let count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(count, 6);
db_snap!(index, word_docids, "updated");
@ -1392,7 +1587,7 @@ mod tests {
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
let rtxn = index.read_txn().unwrap();
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
let all_documents_count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(all_documents_count, 1);
let external_documents_ids = index.external_documents_ids();
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
@ -2844,7 +3039,7 @@ mod tests {
// Ensuring all the returned IDs actually exist
let rtxn = index.read_txn().unwrap();
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
index.compressed_documents(&rtxn, res.documents_ids).unwrap();
}
fn delete_documents<'t>(
@ -3223,7 +3418,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
// list all documents
let results = index.all_documents(&rtxn).unwrap();
let results = index.all_compressed_documents(&rtxn).unwrap();
for result in results {
let (id, _) = result.unwrap();
assert!(

View File

@ -174,10 +174,12 @@ impl<'a, 'i> Transform<'a, 'i> {
let external_documents_ids = self.index.external_documents_ids();
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
let dictionary = self.index.document_decompression_dictionary(wtxn)?;
let primary_key = cursor.primary_key().to_string();
let primary_key_id =
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
let mut decompression_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut document_sorter_value_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new();
@ -253,18 +255,17 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut skip_insertion = false;
if let Some(original_docid) = original_docid {
let original_key = original_docid;
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::Bytes>()
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
let base_compressed_obkv = self.index.documents.get(wtxn, &original_key)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let base_obkv = base_compressed_obkv.decompress_with_optional_dictionary(
&mut decompression_buffer,
dictionary.as_ref(),
)?;
// we check whether the two documents are exactly equal; if so, we can skip this document entirely
if base_obkv == obkv_buffer {
if base_obkv.as_bytes() == obkv_buffer {
// we're not replacing anything
self.replaced_documents_ids.remove(original_docid);
// and we need to put back the original id as it was before
@ -284,13 +285,12 @@ impl<'a, 'i> Transform<'a, 'i> {
document_sorter_value_buffer.clear();
document_sorter_value_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
KvReaderU16::from_slice(base_obkv),
base_obkv,
deladd_operation,
&mut document_sorter_value_buffer,
)?;
self.original_sorter
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
let base_obkv = KvReader::from_slice(base_obkv);
if let Some(flattened_obkv) =
Self::flatten_from_fields_ids_map(base_obkv, &mut self.fields_ids_map)?
{
@ -354,9 +354,12 @@ impl<'a, 'i> Transform<'a, 'i> {
documents_seen: documents_count,
});
drop(dictionary);
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
self.index.put_primary_key(wtxn, &primary_key)?;
self.documents_count += documents_count;
// Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput.
Ok(documents_count)
@ -857,15 +860,21 @@ impl<'a, 'i> Transform<'a, 'i> {
if original_sorter.is_some() || flattened_sorter.is_some() {
let modified_faceted_fields = settings_diff.modified_faceted_fields();
let dictionary = self.index.document_decompression_dictionary(wtxn)?;
let mut original_obkv_buffer = Vec::new();
let mut flattened_obkv_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new();
let mut buffer = Vec::new();
for result in self.index.external_documents_ids().iter(wtxn)? {
let (external_id, docid) = result?;
let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
let old_compressed_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let old_obkv = old_compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,

View File

@ -17,6 +17,7 @@ use super::helpers::{
};
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::heed_codec::CompressedKvWriterU16;
use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
@ -158,6 +159,7 @@ pub(crate) fn write_typed_chunk_into_index(
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let dictionary = index.document_compression_dictionary(wtxn)?;
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -207,7 +209,17 @@ pub(crate) fn write_typed_chunk_into_index(
let db = index.documents.remap_data_type::<Bytes>();
if !writer.is_empty() {
db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
let uncompressed_document_bytes = writer.into_inner().unwrap();
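// If a dictionary has already been trained, store the document compressed;
// otherwise store the raw obkv bytes (still readable via as_non_compressed).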
match dictionary.as_ref() {
Some(dictionary) => {
let compressed = CompressedKvWriterU16::new_with_dictionary(
&uncompressed_document_bytes,
dictionary,
)?;
db.put(wtxn, &docid, compressed.as_bytes())?
}
None => db.put(wtxn, &docid, &uncompressed_document_bytes)?,
}
operations.push(DocumentOperation {
external_id: external_id.to_string(),
internal_id: docid,

View File

@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use bumparaw_collections::RawMap;
@ -47,23 +48,15 @@ pub trait Document<'doc> {
fn geo_field(&self) -> Result<Option<&'doc RawValue>>;
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct DocumentFromDb<'t, Mapper: FieldIdMapper>
where
Mapper: FieldIdMapper,
{
fields_ids_map: &'t Mapper,
content: &'t KvReaderFieldId,
content: Cow<'t, KvReaderFieldId>,
}
impl<'t, Mapper: FieldIdMapper> Clone for DocumentFromDb<'t, Mapper> {
#[inline]
fn clone(&self) -> Self {
*self
}
}
impl<'t, Mapper: FieldIdMapper> Copy for DocumentFromDb<'t, Mapper> {}
impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'t str, &'t RawValue)>> {
let mut it = self.content.iter();

View File

@ -54,6 +54,8 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> {
let external_docid = change.external_docid().to_owned();
todo!("manage documents compression");
// TODO: we could write the uncompressed document here, but we need to create a function that collects and compresses documents.
match change {
DocumentChange::Deletion(deletion) => {

View File

@ -1944,6 +1944,8 @@ mod tests {
// Check that the searchable field is correctly set to "name" only.
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
// When we search for something that is not in
// the searchable fields it must not return any document.
let result = index.search(&rtxn).query("23").execute().unwrap();
@ -1952,10 +1954,17 @@ mod tests {
// When we search for something that is in the searchable fields
// we must find the appropriate document.
let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
let mut compressed_documents =
index.compressed_documents(&rtxn, result.documents_ids).unwrap();
let fid_map = index.fields_ids_map(&rtxn).unwrap();
assert_eq!(documents.len(), 1);
assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
assert_eq!(compressed_documents.len(), 1);
let (_id, compressed_document) = compressed_documents.remove(0);
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
assert_eq!(document.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
drop(dictionary);
drop(rtxn);
// We change the searchable fields to be the "name" field only.
@ -1980,6 +1989,7 @@ mod tests {
// Check that the searchable fields have been reset and documents are found now.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let fid_map = index.fields_ids_map(&rtxn).unwrap();
let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
@ -1988,8 +1998,13 @@ mod tests {
snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
let result = index.search(&rtxn).query("23").execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
let mut compressed_documents =
index.compressed_documents(&rtxn, result.documents_ids).unwrap();
let (_id, compressed_document) = compressed_documents.remove(0);
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
assert_eq!(document.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
}
#[test]
@ -2120,15 +2135,20 @@ mod tests {
// Check that the filterable fields are correctly set.
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let fidmap = index.fields_ids_map(&rtxn).unwrap();
for document in index.all_documents(&rtxn).unwrap() {
let document = document.unwrap();
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
for result in index.all_compressed_documents(&rtxn).unwrap() {
let (_id, compressed_document) = result.unwrap();
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json =
crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document).unwrap();
println!("json: {:?}", json);
}
let count = index
@ -2139,6 +2159,7 @@ mod tests {
.unwrap()
.count();
assert_eq!(count, 3);
drop(dictionary);
drop(rtxn);
// Index a little more documents with new and current facets values.
@ -2228,6 +2249,7 @@ mod tests {
#[test]
fn set_asc_desc_field() {
let index = TempIndex::new();
let mut buffer = Vec::new();
// Set the filterable fields to be the age.
index
@ -2248,12 +2270,16 @@ mod tests {
// Run an empty query just to ensure that the search results are ordered.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
let documents = index.documents(&rtxn, documents_ids).unwrap();
let compressed_documents = index.compressed_documents(&rtxn, documents_ids).unwrap();
// Fetch the documents' "age" field in the order in which the documents appear.
let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
let iter = documents.into_iter().map(|(_, doc)| {
let iter = compressed_documents.into_iter().map(|(_, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let bytes = doc.get(age_field_id).unwrap();
let string = std::str::from_utf8(bytes).unwrap();
string.parse::<u32>().unwrap()
@ -2645,6 +2671,7 @@ mod tests {
#[test]
fn setting_impact_relevancy() {
let index = TempIndex::new();
let mut buffer = Vec::new();
// Set the genres setting
index
@ -2676,8 +2703,12 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).query("S").execute().unwrap();
let first_id = documents_ids[0];
let documents = index.documents(&rtxn, documents_ids).unwrap();
let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap();
let documents = index.compressed_documents(&rtxn, documents_ids).unwrap();
let (_, compressed_content) = documents.iter().find(|(id, _)| *id == first_id).unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let content = compressed_content
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap();
let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap();
@ -2851,7 +2882,7 @@ mod tests {
wtxn.commit().unwrap();
let rtxn = index.write_txn().unwrap();
let docs: StdResult<Vec<_>, _> = index.all_documents(&rtxn).unwrap().collect();
let docs: StdResult<Vec<_>, _> = index.all_compressed_documents(&rtxn).unwrap().collect();
let docs = docs.unwrap();
assert_eq!(docs.len(), 5);
}

View File

@ -349,7 +349,20 @@ fn criteria_ascdesc() {
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::<Vec<_>>();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffers = vec![Vec::new(); index.number_of_documents(&rtxn).unwrap() as usize];
let documents = index
.all_compressed_documents(&rtxn)
.unwrap()
.zip(buffers.iter_mut())
.map(|(compressed, buffer)| {
let (id, compressed) = compressed.unwrap();
let doc = compressed
.decompress_with_optional_dictionary(buffer, dictionary.as_ref())
.unwrap();
(id, doc)
})
.collect::<Vec<_>>();
for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] {
eprintln!("Testing with criterion: {:?}", &criterion);

milli/examples/search.rs Normal file

View File