Generate the dictionary from the first 10k documents

Clément Renault
2024-07-02 15:49:56 +02:00
parent 0d63d02ab2
commit 767f20e30d
4 changed files with 45 additions and 12 deletions


@@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 lz4_flex = "0.11.3"
+zstd = { version = "0.11.2", features = ["zdict_builder"] }
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
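
The new `zstd` dependency is pulled in with the `zdict_builder` feature, which exposes the dictionary-training functions used later in this commit. As a rough, standalone illustration of that API (not code from this commit), the sketch below trains a small dictionary from synthetic JSON samples with `zstd::dict::from_samples` and round-trips one document through the bulk compressor; the sample generator, the 4 KiB size cap, and the use of the bulk API are assumptions made for the example.

```rust
use std::io;

fn main() -> io::Result<()> {
    // Synthetic sample set: many small, similarly-shaped JSON documents.
    // Dictionary training returns an error if the sample set is too small.
    let samples: Vec<Vec<u8>> = (0..10_000)
        .map(|i| {
            format!(r#"{{"id":{i},"title":"Movie {i}","overview":"An adventure-horror trailer."}}"#)
                .into_bytes()
        })
        .collect();

    // Train a dictionary of at most 4 KiB from the samples.
    let dictionary = zstd::dict::from_samples(&samples, 4_096)?;

    // Round-trip one document through the bulk API using that dictionary.
    let mut compressor = zstd::bulk::Compressor::with_dictionary(0, &dictionary)?;
    let compressed = compressor.compress(&samples[0])?;

    let mut decompressor = zstd::bulk::Decompressor::with_dictionary(&dictionary)?;
    let decompressed = decompressor.decompress(&compressed, samples[0].len())?;
    assert_eq!(decompressed, samples[0]);

    Ok(())
}
```

Training needs a reasonably large and varied sample set, which is presumably why the indexer below waits until it has seen enough documents before building the dictionary.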


@@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
     pub fn decompress_with<'b>(
         &self,
         buffer: &'b mut Vec<u8>,
-        dictionnary: &[u8],
+        dictionary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
         let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
         buffer.resize(size, 0);
         // TODO loop to increase the buffer size of need be
         let size =
-            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
+            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
         Ok(KvReaderU16::new(&buffer[..size]))
     }
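
To make the reader path above concrete, here is a minimal sketch of the same round trip outside milli. It assumes the writer side uses `lz4_flex::block::compress_prepend_size_with_dict`, which is what the `uncompressed_size` call in `decompress_with` expects; the dictionary and document bytes are placeholders.

```rust
fn main() -> Result<(), lz4_flex::block::DecompressError> {
    // Placeholder dictionary and document; in milli the dictionary is stored in
    // the index and the document is an obkv buffer.
    let dictionary: &[u8] = b"movietraileradventurehorror";
    let document: &[u8] = b"an adventure movie trailer, not a horror trailer";

    // Writer side: compress with the dictionary and prepend the uncompressed size.
    let compressed = lz4_flex::block::compress_prepend_size_with_dict(document, dictionary);

    // Reader side: mirrors `decompress_with` above.
    let (size, input) = lz4_flex::block::uncompressed_size(&compressed)?;
    let mut buffer = vec![0u8; size];
    let written =
        lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
    assert_eq!(&buffer[..written], document);

    Ok(())
}
```

The caller-provided `buffer` in the real method exists, presumably, so the same allocation can be reused across many document decompressions.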


@@ -5,7 +5,7 @@ mod transform;
 mod typed_chunk;
 
 use std::collections::{HashMap, HashSet};
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek, Write};
 use std::iter;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -41,7 +41,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::EmbeddingConfigs;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -568,7 +568,7 @@ where
         // TODO increase this number to 10k and put it in a const somewhere
         // I don't like that this dangerous condition is here...
-        if number_of_documents > 1_000
+        if number_of_documents > 10_000
             && self.index.document_compression_dictionary(self.wtxn)?.is_none()
         {
             self.manage_compression_dictionary()?;
@@ -767,17 +767,29 @@
         name = "compress_documents_database"
     )]
     pub fn manage_compression_dictionary(&mut self) -> Result<()> {
-        // TODO This is a dumb dictionary, just so you get the idea.
-        // We need to compute a better one by using zstd or something else.
-        let dictionary = b"movietraileradventurehorror";
-        self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
+        let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
+        let mut sample_sizes = Vec::new();
+        // TODO make this 1_000 be 10k and const
+        let documents = self.index.documents.remap_types::<BEU32, Bytes>();
+        for result in documents.iter(self.wtxn)?.take(10_000) {
+            let (_id, bytes) = result?;
+            sample_file.write_all(bytes)?;
+            sample_sizes.push(bytes.len());
+        }
+        // TODO manage this unwrap correctly
+        let sample_file = sample_file.into_inner().unwrap();
+        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
+        // TODO make this 64_000 const
+        let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
+        self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
         // TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
         let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
             let (docid, document) = result?;
             // TODO manage this unwrap correctly
-            let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
+            let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
             // safety the compressed document is entirely owned
             unsafe {
                 iter.put_current_with_options::<CompressedObkvCodec>(
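
For context on `zstd::dict::from_continuous`: it takes one continuous buffer holding all samples back to back, plus the individual sample sizes, and returns a dictionary of at most the requested size. The sketch below shows the same sampling idea in isolation (not milli code); the synthetic documents, the in-memory buffer instead of the commit's tempfile + mmap, and the 4 KiB cap (the commit asks for 64_000 bytes) are assumptions made for the example.

```rust
use std::io;

fn main() -> io::Result<()> {
    // Stand-in for "the first 10k documents of the index": small JSON-ish blobs.
    let documents: Vec<Vec<u8>> = (0..10_000)
        .map(|i| {
            let genre = ["adventure", "horror", "comedy"][i % 3];
            format!(r#"{{"id":{i},"title":"Movie {i}","genre":"{genre}"}}"#).into_bytes()
        })
        .collect();

    // Concatenate the sampled documents into one continuous buffer and record
    // each sample's length, like the indexer does with its tempfile + mmap.
    let mut sample_data = Vec::new();
    let mut sample_sizes = Vec::new();
    for bytes in documents.iter().take(10_000) {
        sample_data.extend_from_slice(bytes);
        sample_sizes.push(bytes.len());
    }

    // Train a dictionary capped at 4 KiB from the continuous sample buffer.
    let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 4_096)?;
    println!("trained a {}-byte dictionary", dictionary.len());

    Ok(())
}
```

In the hunk above, the resulting bytes are then stored with `put_document_compression_dictionary` and passed to `CompressedKvWriterU16::new_with_dictionary` to recompress every document already in the database.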