Compare commits

...

11 Commits

23 changed files with 1188 additions and 677 deletions

Cargo.lock (generated), 936 changes

File diff suppressed because it is too large

View File

@ -110,7 +110,7 @@ fn main() {
// after executing a batch we check if the database is corrupted
let res = index.search(&wtxn).execute().unwrap();
index.documents(&wtxn, res.documents_ids).unwrap();
index.compressed_documents(&wtxn, res.documents_ids).unwrap();
progression.fetch_add(1, Ordering::Relaxed);
}
wtxn.abort();

View File

@ -908,16 +908,22 @@ impl IndexScheduler {
let mut index_dumper = dump.create_index(uid, &metadata)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let dictionary = index.document_decompression_dictionary(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(&rtxn)?;
let mut buffer = Vec::new();
// 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
for ret in index.all_compressed_documents(&rtxn)? {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let (id, doc) = ret?;
let (id, compressed) = ret?;
let doc = compressed.decompress_with_optional_dictionary(
&mut buffer,
dictionary.as_ref(),
)?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;

View File

@ -2465,12 +2465,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
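Every test hunk below repeats the same pattern: grab the decompression dictionary, iterate over the compressed documents, decompress each one into a scratch buffer, and convert the resulting obkv to JSON. A hypothetical helper along these lines (not part of this diff, written against the milli items these tests already import) would capture the whole sequence:

fn all_documents_as_json(
    index: &milli::Index,
    rtxn: &milli::heed::RoTxn,
) -> Vec<serde_json::Map<String, serde_json::Value>> {
    let mut buffer = Vec::new();
    let dictionary = index.document_decompression_dictionary(rtxn).unwrap();
    let field_ids_map = index.fields_ids_map(rtxn).unwrap();
    let field_ids: Vec<_> = field_ids_map.ids().collect();
    index
        .all_compressed_documents(rtxn)
        .unwrap()
        .map(|ret| {
            let (_id, compressed) = ret.unwrap();
            // The scratch buffer is reused across documents; decompression grows it as needed.
            let doc = compressed
                .decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
                .unwrap();
            milli::obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
        })
        .collect()
}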
@ -2525,12 +2533,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -2904,12 +2920,20 @@ mod tests {
// has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -2955,12 +2979,20 @@ mod tests {
// has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -3011,12 +3043,20 @@ mod tests {
// has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -3129,12 +3169,20 @@ mod tests {
// has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -3184,12 +3232,20 @@ mod tests {
// has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -3898,12 +3954,20 @@ mod tests {
// Has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -3969,12 +4033,20 @@ mod tests {
// Has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4037,12 +4109,20 @@ mod tests {
// Has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4098,12 +4178,20 @@ mod tests {
// Has everything being pushed successfully in milli?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4159,6 +4247,8 @@ mod tests {
// Is the primary key still what we expect?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let primary_key = index.primary_key(&rtxn).unwrap().unwrap();
snapshot!(primary_key, @"id");
@ -4166,9 +4256,15 @@ mod tests {
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4220,6 +4316,8 @@ mod tests {
// Is the primary key still what we expect?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let primary_key = index.primary_key(&rtxn).unwrap().unwrap();
snapshot!(primary_key, @"id");
@ -4227,9 +4325,15 @@ mod tests {
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4303,6 +4407,8 @@ mod tests {
// Is the primary key still what we expect?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let primary_key = index.primary_key(&rtxn).unwrap().unwrap();
snapshot!(primary_key, @"id");
@ -4310,9 +4416,15 @@ mod tests {
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4389,6 +4501,8 @@ mod tests {
// Is the primary key still what we expect?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let primary_key = index.primary_key(&rtxn).unwrap().unwrap();
snapshot!(primary_key, @"paw");
@ -4396,9 +4510,15 @@ mod tests {
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -4468,6 +4588,8 @@ mod tests {
// Is the primary key still what we expect?
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let primary_key = index.primary_key(&rtxn).unwrap().unwrap();
snapshot!(primary_key, @"doggoid");
@ -4475,9 +4597,15 @@ mod tests {
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
@ -5120,6 +5248,8 @@ mod tests {
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
// Ensure the documents have been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
@ -5139,8 +5269,12 @@ mod tests {
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true");
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
let (_id, compressed_doc) =
index.compressed_documents(&rtxn, std::iter::once(0)).unwrap().remove(0);
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let doc = obkv_to_json(
&[
fields_ids_map.id("doggo").unwrap(),
@ -5194,6 +5328,8 @@ mod tests {
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
// Ensure the documents have been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
@ -5216,8 +5352,12 @@ mod tests {
// remained beagle
assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true");
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
let (_id, compressed_doc) =
index.compressed_documents(&rtxn, std::iter::once(0)).unwrap().remove(0);
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let doc = obkv_to_json(
&[
fields_ids_map.id("doggo").unwrap(),
@ -5309,12 +5449,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
@ -5348,12 +5496,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
// all the vectors linked to the newly specified embedder have been removed
// Only the unknown embedders stay in the document DB
@ -5456,9 +5612,15 @@ mod tests {
// the document with the id 3 should have its original embedding updated
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
let doc = index.documents(&rtxn, Some(docid)).unwrap()[0];
let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap();
let (_id, compressed_doc) =
index.compressed_documents(&rtxn, Some(docid)).unwrap().remove(0);
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let doc = obkv_to_json(&field_ids, &field_ids_map, doc).unwrap();
snapshot!(json_string!(doc), @r###"
{
"id": 3,
@ -5570,12 +5732,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###);
let conf = index.embedding_configs(&rtxn).unwrap();
@ -5610,12 +5780,20 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @"[]");
let conf = index.embedding_configs(&rtxn).unwrap();
@ -5726,12 +5904,20 @@ mod tests {
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###);
}
@ -5761,12 +5947,20 @@ mod tests {
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###);
}
@ -5794,12 +5988,20 @@ mod tests {
{
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.all_compressed_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.map(|ret| {
let (_id, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>();
// FIXME: redaction

View File

@ -12,7 +12,7 @@ pub mod star_or;
pub mod task_view;
pub mod tasks;
pub mod versioning;
pub use milli::{heed, Index};
pub use milli::{heed, zstd, Index};
use uuid::Uuid;
pub use versioning::VERSION_FILE_NAME;
pub use {milli, serde_cs};

View File

@ -125,7 +125,7 @@ reqwest = { version = "0.12.5", features = [
sha-1 = { version = "0.10.1", optional = true }
static-files = { version = "0.2.4", optional = true }
tempfile = { version = "3.10.1", optional = true }
zip = { version = "2.1.3", optional = true }
zip = { version = "2.1.3", default-features = false, features = ["deflate"], optional = true }
[features]
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]

View File

@ -603,44 +603,51 @@ fn some_documents<'a, 't: 'a>(
retrieve_vectors: RetrieveVectors,
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let dictionary = index.document_decompression_dictionary(rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(rtxn)?;
let mut buffer = Vec::new();
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Ignore => {}
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
// Clippy is simply wrong
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
);
Ok(index.iter_compressed_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(
|(key, compressed_document)| -> Result<_, ResponseError> {
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Ignore => {}
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
// Clippy is simply wrong
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings)
.map_err(MeilisearchHttpError::from)?,
);
}
document.insert("_vectors".into(), vectors.into());
}
document.insert("_vectors".into(), vectors.into());
}
}
Ok(document)
})
Ok(document)
},
)
}))
}

View File

@ -1123,10 +1123,16 @@ fn make_hits(
formatter_builder.crop_marker(format.crop_marker);
formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag);
let decompression_dictionary = index.document_decompression_dictionary(rtxn)?;
let mut buffer = Vec::new();
let mut documents = Vec::new();
let embedding_configs = index.embedding_configs(rtxn)?;
let documents_iter = index.documents(rtxn, documents_ids)?;
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
let documents_iter = index.compressed_documents(rtxn, documents_ids)?;
for ((id, compressed), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
let obkv = compressed
.decompress_with_optional_dictionary(&mut buffer, decompression_dictionary.as_ref())
// TODO use a better error?
.map_err(|e| MeilisearchHttpError::HeedError(e.into()))?;
// First generate a document with all the displayed fields
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;

View File

@ -260,6 +260,7 @@ fn export_a_dump(
// 4. Dump the indexes
let mut count = 0;
let mut buffer = Vec::new();
for result in index_mapping.iter(&rtxn)? {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
@ -268,6 +269,7 @@ fn export_a_dump(
})?;
let rtxn = index.read_txn()?;
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
@ -280,8 +282,11 @@ fn export_a_dump(
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
// 4.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
let (_id, doc) = ret?;
for ret in index.all_compressed_documents(&rtxn)? {
let (_id, compressed_doc) = ret?;
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}

View File

@ -38,6 +38,7 @@ heed = { version = "0.20.3", default-features = false, features = [
indexmap = { version = "2.2.6", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
zstd = { version = "0.13.1", features = ["zdict_builder", "experimental"] }
memmap2 = "0.9.4"
obkv = "0.2.2"
once_cell = "1.19.0"

View File

@ -30,6 +30,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let index = Index::new(options, dataset)?;
let txn = index.read_txn()?;
let dictionary = index.document_decompression_dictionary(&txn).unwrap();
let mut query = String::new();
while stdin().read_line(&mut query)? > 0 {
for _ in 0..2 {
@ -49,6 +50,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let start = Instant::now();
let mut ctx = SearchContext::new(&index, &txn)?;
let mut buffer = Vec::new();
let universe = filtered_universe(ctx.index, ctx.txn, &None)?;
let docs = execute_search(
@ -75,11 +77,14 @@ fn main() -> Result<(), Box<dyn Error>> {
let elapsed = start.elapsed();
println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
if print_documents {
let documents = index
.documents(&txn, docs.documents_ids.iter().copied())
let compressed_documents = index
.compressed_documents(&txn, docs.documents_ids.iter().copied())
.unwrap()
.into_iter()
.map(|(id, obkv)| {
.map(|(id, compressed_obkv)| {
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let mut object = serde_json::Map::default();
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
let value = obkv.get(fid).unwrap();
@ -90,17 +95,20 @@ fn main() -> Result<(), Box<dyn Error>> {
})
.collect::<Vec<_>>();
for (id, document) in documents {
for (id, document) in compressed_documents {
println!("{id}:");
println!("{document}");
}
let documents = index
.documents(&txn, docs.documents_ids.iter().copied())
let compressed_documents = index
.compressed_documents(&txn, docs.documents_ids.iter().copied())
.unwrap()
.into_iter()
.map(|(id, obkv)| {
.map(|(id, compressed_obkv)| {
let mut object = serde_json::Map::default();
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
let value = obkv.get(fid).unwrap();
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
@ -110,7 +118,7 @@ fn main() -> Result<(), Box<dyn Error>> {
})
.collect::<Vec<_>>();
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
for (id, document) in documents {
for (id, document) in compressed_documents {
println!("{id}:");
println!("{document}");
}

View File

@ -0,0 +1,89 @@
use std::borrow::Cow;
use std::io;
use std::io::ErrorKind;
use heed::BoxedError;
use obkv::KvReaderU16;
use zstd::bulk::{Compressor, Decompressor};
use zstd::dict::{DecoderDictionary, EncoderDictionary};
pub struct CompressedObkvCodec;
impl<'a> heed::BytesDecode<'a> for CompressedObkvCodec {
type DItem = CompressedKvReaderU16<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
Ok(CompressedKvReaderU16(bytes))
}
}
impl heed::BytesEncode<'_> for CompressedObkvCodec {
type EItem = CompressedKvWriterU16;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
Ok(Cow::Borrowed(&item.0))
}
}
pub struct CompressedKvReaderU16<'a>(&'a [u8]);
impl<'a> CompressedKvReaderU16<'a> {
/// Decompresses the KvReader into the buffer using the provided dictionary.
pub fn decompress_with<'b>(
&self,
buffer: &'b mut Vec<u8>,
dictionary: &DecoderDictionary,
) -> io::Result<KvReaderU16<'b>> {
const TWO_GIGABYTES: usize = 2 * 1024 * 1024 * 1024;
let mut decompressor = Decompressor::with_prepared_dictionary(dictionary)?;
let mut max_size = self.0.len() * 4;
let size = loop {
buffer.resize(max_size, 0);
match decompressor.decompress_to_buffer(self.0, &mut buffer[..max_size]) {
Ok(size) => break size,
// TODO don't do that !!! But what should I do?
Err(e) if e.kind() == ErrorKind::Other && max_size <= TWO_GIGABYTES => {
max_size *= 2
}
Err(e) => return Err(e),
}
};
Ok(KvReaderU16::new(&buffer[..size]))
}
/// Returns the KvReader as if it were not compressed.
/// Happens when there is no dictionary yet.
pub fn as_non_compressed(&self) -> KvReaderU16<'a> {
KvReaderU16::new(self.0)
}
/// Decompresses this KvReader if necessary.
pub fn decompress_with_optional_dictionary<'b>(
&self,
buffer: &'b mut Vec<u8>,
dictionary: Option<&DecoderDictionary>,
) -> io::Result<KvReaderU16<'b>>
where
'a: 'b,
{
match dictionary {
Some(dict) => self.decompress_with(buffer, dict),
None => Ok(self.as_non_compressed()),
}
}
}
pub struct CompressedKvWriterU16(Vec<u8>);
impl CompressedKvWriterU16 {
// TODO ask for a KvReaderU16 here
pub fn new_with_dictionary(input: &[u8], dictionary: &EncoderDictionary) -> io::Result<Self> {
let mut compressor = Compressor::with_prepared_dictionary(dictionary)?;
compressor.compress(input).map(CompressedKvWriterU16)
}
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
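A minimal round-trip sketch (not part of the diff) of how the two halves of this codec fit together, assuming raw_obkv holds the serialized obkv bytes of a document and dict_bytes the dictionary stored in the index:

use heed::BytesDecode;
use zstd::dict::{DecoderDictionary, EncoderDictionary};

fn roundtrip(raw_obkv: &[u8], dict_bytes: &[u8]) -> Result<(), Box<dyn std::error::Error>> {
    // Same compression level the index uses when building its encoder dictionary.
    const COMPRESSION_LEVEL: i32 = 19;
    let encoder_dict = EncoderDictionary::copy(dict_bytes, COMPRESSION_LEVEL);
    let decoder_dict = DecoderDictionary::copy(dict_bytes);

    // Write path: compress the obkv bytes before they are stored in LMDB.
    let compressed = CompressedKvWriterU16::new_with_dictionary(raw_obkv, &encoder_dict)?;

    // Read path: the codec hands back a lazy reader over the stored bytes.
    let reader = CompressedObkvCodec::bytes_decode(compressed.as_bytes())?;
    let mut buffer = Vec::new();
    let document = reader.decompress_with(&mut buffer, &decoder_dict)?;
    assert_eq!(document.as_bytes(), raw_obkv);
    Ok(())
}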

View File

@ -1,6 +1,7 @@
mod beu16_str_codec;
mod beu32_str_codec;
mod byte_slice_ref;
mod compressed_obkv_codec;
pub mod facet;
mod field_id_word_count_codec;
mod fst_set_codec;
@ -19,6 +20,9 @@ use thiserror::Error;
pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::compressed_obkv_codec::{
CompressedKvReaderU16, CompressedKvWriterU16, CompressedObkvCodec,
};
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec;

View File

@ -11,6 +11,7 @@ use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use zstd::dict::{DecoderDictionary, EncoderDictionary};
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
@ -20,7 +21,8 @@ use crate::heed_codec::facet::{
FieldIdCodec, OrderedF64Codec,
};
use crate::heed_codec::{
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
BEU16StrCodec, CompressedKvReaderU16, CompressedObkvCodec, FstSetCodec, ScriptLanguageCodec,
StrBEU16Codec, StrRefCodec,
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
@ -29,8 +31,8 @@ use crate::vector::{Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
FieldidsWeightsMap, GeoPoint, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search,
U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
};
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -73,6 +75,7 @@ pub mod main_key {
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
pub const SEARCH_CUTOFF: &str = "search_cutoff";
pub const DOCUMENT_COMPRESSION_DICTIONARY: &str = "document-compression-dictionary";
}
pub mod db_name {
@ -172,7 +175,7 @@ pub struct Index {
pub vector_arroy: arroy::Database<arroy::distances::Angular>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<BEU32, ObkvCodec>,
pub(crate) documents: Database<BEU32, CompressedObkvCodec>,
}
impl Index {
@ -339,6 +342,50 @@ impl Index {
self.env.prepare_for_closing()
}
/* document compression dictionary */
/// Writes the dictionary that will later be used to compress the documents.
pub fn put_document_compression_dictionary(
&self,
wtxn: &mut RwTxn,
dictionary: &[u8],
) -> heed::Result<()> {
self.main.remap_types::<Str, Bytes>().put(
wtxn,
main_key::DOCUMENT_COMPRESSION_DICTIONARY,
dictionary,
)
}
/// Deletes the document compression dictionary.
pub fn delete_document_compression_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::DOCUMENT_COMPRESSION_DICTIONARY)
}
/// Returns the optional raw bytes dictionary to be used when reading or writing the OBKV documents.
pub fn document_compression_raw_dictionary<'t>(
&self,
rtxn: &'t RoTxn,
) -> heed::Result<Option<&'t [u8]>> {
self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::DOCUMENT_COMPRESSION_DICTIONARY)
}
pub fn document_decompression_dictionary<'t>(
&self,
rtxn: &'t RoTxn,
) -> heed::Result<Option<DecoderDictionary<'t>>> {
self.document_compression_raw_dictionary(rtxn).map(|opt| opt.map(DecoderDictionary::new))
}
pub fn document_compression_dictionary(
&self,
rtxn: &RoTxn,
) -> heed::Result<Option<EncoderDictionary<'static>>> {
const COMPRESSION_LEVEL: i32 = 19;
self.document_compression_raw_dictionary(rtxn)
.map(|opt| opt.map(|bytes| EncoderDictionary::copy(bytes, COMPRESSION_LEVEL)))
}
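A short sketch of how these getters are meant to be combined (hypothetical caller code, assuming an already-open Index):

fn show_document_zero(index: &milli::Index) -> milli::Result<()> {
    let rtxn = index.read_txn()?;
    // Read path: wraps the raw dictionary bytes into a zero-copy DecoderDictionary.
    let decoder = index.document_decompression_dictionary(&rtxn)?;
    // Write path: copies the same bytes into an EncoderDictionary at level 19.
    let _encoder = index.document_compression_dictionary(&rtxn)?;

    let mut buffer = Vec::new();
    for (docid, compressed) in index.compressed_documents(&rtxn, Some(0))? {
        let obkv =
            compressed.decompress_with_optional_dictionary(&mut buffer, decoder.as_ref())?;
        println!("document {docid} has {} fields", obkv.iter().count());
    }
    Ok(())
}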
/* documents ids */
/// Writes the documents ids that correspond to the user-ids-documents-ids FST.
@ -1261,36 +1308,36 @@ impl Index {
/* documents */
/// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
pub fn iter_documents<'a, 't: 'a>(
/// Returns an iterator over the requested compressed documents. The next item will be an error if a document is missing.
pub fn iter_compressed_documents<'a, 't: 'a>(
&'a self,
rtxn: &'t RoTxn<'t>,
ids: impl IntoIterator<Item = DocumentId> + 'a,
) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
) -> Result<impl Iterator<Item = Result<(DocumentId, CompressedKvReaderU16<'t>)>> + 'a> {
Ok(ids.into_iter().map(move |id| {
let kv = self
let compressed = self
.documents
.get(rtxn, &id)?
.ok_or(UserError::UnknownInternalDocumentId { document_id: id })?;
Ok((id, kv))
Ok((id, compressed))
}))
}
/// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
pub fn documents<'t>(
pub fn compressed_documents<'t>(
&self,
rtxn: &'t RoTxn<'t>,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
self.iter_documents(rtxn, ids)?.collect()
) -> Result<Vec<(DocumentId, CompressedKvReaderU16<'t>)>> {
self.iter_compressed_documents(rtxn, ids)?.collect()
}
/// Returns an iterator over all the documents in the index.
pub fn all_documents<'a, 't: 'a>(
pub fn all_compressed_documents<'a, 't: 'a>(
&'a self,
rtxn: &'t RoTxn<'t>,
) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
self.iter_documents(rtxn, self.documents_ids(rtxn)?)
) -> Result<impl Iterator<Item = Result<(DocumentId, CompressedKvReaderU16<'t>)>> + 'a> {
self.iter_compressed_documents(rtxn, self.documents_ids(rtxn)?)
}
pub fn external_id_of<'a, 't: 'a>(
@ -1311,8 +1358,13 @@ impl Index {
process: "external_id_of",
})
})?;
Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
let (_docid, obkv) = entry?;
let dictionary =
self.document_compression_raw_dictionary(rtxn)?.map(DecoderDictionary::copy);
let mut buffer = Vec::new();
Ok(self.iter_compressed_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
let (_docid, compressed_obkv) = entry?;
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
match primary_key.document_id(&obkv, &fields)? {
Ok(document_id) => Ok(document_id),
Err(_) => Err(InternalError::DocumentsError(
@ -2441,7 +2493,12 @@ pub(crate) mod tests {
"###);
let rtxn = index.read_txn().unwrap();
let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0];
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let (_docid, compressed_obkv) = index.compressed_documents(&rtxn, [0]).unwrap().remove(0);
let mut buffer = Vec::new();
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
@ -2450,7 +2507,10 @@ pub(crate) mod tests {
"###);
// Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34
let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0];
let (_docid, compressed_obkv) = index.compressed_documents(&rtxn, [2]).unwrap().remove(0);
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
@ -2459,6 +2519,7 @@ pub(crate) mod tests {
}
"###);
drop(dictionary);
drop(rtxn);
// Add new documents again
@ -2657,11 +2718,16 @@ pub(crate) mod tests {
} = search.execute().unwrap();
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
documents_ids.sort_unstable();
let docs = index.documents(&rtxn, documents_ids).unwrap();
let compressed_docs = index.compressed_documents(&rtxn, documents_ids).unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffer = Vec::new();
let mut all_ids = HashSet::new();
for (_docid, obkv) in docs {
let id = obkv.get(primary_key_id).unwrap();
assert!(all_ids.insert(id));
for (_docid, compressed) in compressed_docs {
let doc = compressed
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let id = doc.get(primary_key_id).unwrap();
assert!(all_ids.insert(id.to_vec()));
}
}

View File

@ -45,7 +45,7 @@ pub use search::new::{
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed};
pub use {charabia as tokenizer, heed, zstd};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};

View File

@ -24,8 +24,13 @@ fn collect_field_values(
) -> Vec<String> {
let mut values = vec![];
let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap();
for doc in index.documents(txn, docids.iter().copied()).unwrap() {
if let Some(v) = doc.1.get(fid) {
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(txn).unwrap();
for (_id, compressed_doc) in index.compressed_documents(txn, docids.iter().copied()).unwrap() {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
if let Some(v) = doc.get(fid) {
let v: serde_json::Value = serde_json::from_slice(v).unwrap();
let v = v.to_string();
values.push(v);

View File

@ -407,9 +407,15 @@ pub fn snap_documents(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let display = fields_ids_map.ids().collect::<Vec<_>>();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffer = Vec::new();
for document in index.all_documents(&rtxn).unwrap() {
let doc = obkv_to_json(&display, &fields_ids_map, document.unwrap().1).unwrap();
for result in index.all_compressed_documents(&rtxn).unwrap() {
let (_id, compressed_document) = result.unwrap();
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let doc = obkv_to_json(&display, &fields_ids_map, document).unwrap();
snap.push_str(&serde_json::to_string(&doc).unwrap());
snap.push('\n');
}

View File

@ -63,6 +63,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_document_compression_dictionary(self.wtxn)?;
// Remove all user-provided bits from the configs
let mut configs = self.index.embedding_configs(self.wtxn)?;

View File

@ -5,7 +5,7 @@ mod transform;
mod typed_chunk;
use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek};
use std::io::{BufWriter, Read, Seek, Write};
use std::iter;
use std::num::NonZeroU32;
use std::result::Result as StdResult;
@ -13,8 +13,8 @@ use std::sync::Arc;
use crossbeam_channel::{Receiver, Sender};
use grenad::{Merger, MergerBuilder};
use heed::types::Str;
use heed::Database;
use heed::types::{Bytes, Str};
use heed::{Database, PutFlags};
use rand::SeedableRng;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
@ -34,13 +34,14 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::heed_codec::{CompressedKvWriterU16, CompressedObkvCodec};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::vector::EmbeddingConfigs;
use crate::{CboRoaringBitmapCodec, Index, Result};
use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 4;
@ -266,7 +267,7 @@ where
target = "indexing::details",
name = "index_documents_raw"
)]
pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
pub fn execute_raw(mut self, output: TransformOutput) -> Result<u64>
where
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
@ -565,6 +566,10 @@ where
word_fid_docids.map(MergerBuilder::build),
)?;
// This call contains an internal condition so that we do not regenerate
// the compression dictionary and recompress the documents on every run.
self.manage_compression_dictionary()?;
Ok(number_of_documents)
}
@ -575,7 +580,7 @@ where
name = "index_documents_prefix_databases"
)]
pub fn execute_prefix_databases(
self,
&mut self,
word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
@ -747,6 +752,64 @@ where
Ok(())
}
/// Computes a new dictionary and compresses the documents in the database with it.
///
/// Documents still need to be compressed directly when they are written to the database while a dictionary exists.
#[tracing::instrument(
level = "trace",
skip_all,
target = "indexing::compression",
name = "compress_documents_database"
)]
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
/// The size of the dictionary generated from a sample of the documents already
/// in the database. It will be used when compressing and decompressing documents.
const COMPRESSION_DICTIONARY_SIZE: usize = 64_000;
/// The minimum number of documents to trigger the generation of the compression dictionary.
const COMPRESSION_ON_NUMBER_OF_DOCUMENTS: usize = 10_000;
if self.index.number_of_documents(self.wtxn)? < COMPRESSION_ON_NUMBER_OF_DOCUMENTS as u64
|| self.index.document_compression_dictionary(self.wtxn)?.is_some()
{
return Ok(());
}
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
let mut sample_sizes = Vec::new();
// TODO make this 1_000 be 10k and const
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
for result in documents.iter(self.wtxn)?.take(COMPRESSION_ON_NUMBER_OF_DOCUMENTS) {
let (_id, bytes) = result?;
sample_file.write_all(bytes)?;
sample_sizes.push(bytes.len());
}
let sample_file = sample_file.into_inner().map_err(|ie| ie.into_error())?;
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
let dictionary =
zstd::dict::from_continuous(&sample_data, &sample_sizes, COMPRESSION_DICTIONARY_SIZE)?;
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
// safety: We just set the dictionary above. It must be there when we get it back.
let dictionary = self.index.document_compression_dictionary(self.wtxn)?.unwrap();
let mut iter = self.index.documents.iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (docid, document) = result?;
let document = document.as_non_compressed().as_bytes();
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary)?;
// safety: the compressed document is entirely owned
unsafe {
iter.put_current_with_options::<CompressedObkvCodec>(
PutFlags::empty(),
&docid,
&compressed,
)?;
}
}
Ok(())
}
}
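For reference, the training step in isolation: zstd::dict::from_continuous expects the samples laid out back to back plus the length of each one, which is why the code above streams them into a temporary file and mmaps it. A purely in-memory variant (a hypothetical sketch, not how the indexer does it) looks like this:

fn train_document_dictionary(samples: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
    // Same target size as the indexer: a 64 kB dictionary.
    const COMPRESSION_DICTIONARY_SIZE: usize = 64_000;
    let sample_sizes: Vec<usize> = samples.iter().map(|sample| sample.len()).collect();
    let continuous: Vec<u8> = samples.concat();
    zstd::dict::from_continuous(&continuous, &sample_sizes, COMPRESSION_DICTIONARY_SIZE)
}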
/// Run the word prefix docids update operation.
@ -834,7 +897,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
let count = index.all_documents(&rtxn).unwrap().count();
let count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(count, 3);
drop(rtxn);
@ -843,6 +906,7 @@ mod tests {
#[test]
fn simple_document_merge() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
// First we send 3 documents with duplicate ids and
@ -861,16 +925,21 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
let mut compressed_docs = index.compressed_documents(&rtxn, Some(0)).unwrap();
assert_eq!(compressed_docs.len(), 1);
let (id, compressed_doc) = compressed_docs.remove(0);
assert_eq!(id, 0);
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), None);
drop(dictionary);
drop(rtxn);
// Second we send 1 document with id 1, to force it to be merged with the previous one.
@ -882,10 +951,14 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
let mut compressed_docs = index.compressed_documents(&rtxn, Some(0)).unwrap();
assert_eq!(compressed_docs.len(), 1);
let (id, compressed_doc) = compressed_docs.remove(0);
assert_eq!(id, 0);
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
@ -893,6 +966,7 @@ mod tests {
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), Some((2, &b"25"[..])));
assert_eq!(doc_iter.next(), None);
drop(dictionary);
drop(rtxn);
}
@ -917,6 +991,7 @@ mod tests {
#[test]
fn simple_auto_generated_documents_ids() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.autogenerate_docids = true;
// First we send 3 documents with ids from 1 to 3.
index
@ -929,12 +1004,26 @@ mod tests {
// Check that there is 3 documents now.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (_id, obkv) = docs.iter().find(|(_id, kv)| kv.get(0) == Some(br#""kevin""#)).unwrap();
let compressed_docs = index.compressed_documents(&rtxn, vec![0, 1, 2]).unwrap();
let (_id, compressed_obkv) = compressed_docs
.iter()
.find(|(_id, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
doc.get(0) == Some(br#""kevin""#)
})
.unwrap();
let obkv = compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let kevin_uuid: String = serde_json::from_slice(obkv.get(1).unwrap()).unwrap();
drop(dictionary);
drop(rtxn);
// Second we send 1 document with the generated uuid, to erase the previous ones.
@ -942,21 +1031,34 @@ mod tests {
// Check that there is **always** 3 documents.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
// the document 0 has been deleted and reinserted with the id 3
let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
let kevin_position =
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
let mut compressed_docs = index.compressed_documents(&rtxn, vec![1, 2, 0]).unwrap();
let kevin_position = compressed_docs
.iter()
.position(|(_, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
doc.get(0).unwrap() == br#""updated kevin""#
})
.unwrap();
assert_eq!(kevin_position, 2);
let (_, doc) = docs[kevin_position];
let (_, compressed_doc) = compressed_docs.remove(kevin_position);
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
// Check that this document is equal to the last
// one sent and that an UUID has been generated.
assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..]));
// This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes (").
assert_eq!(doc.get(1).unwrap().len(), 36 + 2);
drop(dictionary);
drop(rtxn);
}
@ -1088,7 +1190,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 6);
let count = index.all_documents(&rtxn).unwrap().count();
let count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(count, 6);
db_snap!(index, word_docids, "updated");
@ -1506,7 +1608,7 @@ mod tests {
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
let rtxn = index.read_txn().unwrap();
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
let all_documents_count = index.all_compressed_documents(&rtxn).unwrap().count();
assert_eq!(all_documents_count, 1);
let external_documents_ids = index.external_documents_ids();
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
@ -2796,7 +2898,7 @@ mod tests {
// Ensuring all the returned IDs actually exists
let rtxn = index.read_txn().unwrap();
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
index.compressed_documents(&rtxn, res.documents_ids).unwrap();
}
fn delete_documents<'t>(
@ -3163,7 +3265,7 @@ mod tests {
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
// list all documents
let results = index.all_documents(&wtxn).unwrap();
let results = index.all_compressed_documents(&wtxn).unwrap();
for result in results {
let (id, _) = result.unwrap();
assert!(

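The hunks above all follow the same read-side pattern introduced by this change: fetch the optional decompression dictionary once per read transaction, keep one scratch buffer, and decompress each stored entry before reading its fields. A minimal sketch of that pattern, not part of the diff; the `milli::Result` alias, the `u32` docid type, and the helper's name are assumptions, while the calls themselves are the ones visible in these hunks:

use milli::Index;

// Sketch only: decompress a batch of documents and read their first field.
fn read_first_fields(index: &Index, docids: Vec<u32>) -> milli::Result<()> {
    let rtxn = index.read_txn()?;
    // Works whether or not the index has a compression dictionary.
    let dictionary = index.document_decompression_dictionary(&rtxn)?;
    let mut buffer = Vec::new();
    for (_docid, compressed) in index.compressed_documents(&rtxn, docids)? {
        // Decompression yields the same obkv reader callers used before this change.
        let doc = compressed
            .decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
        let _first_field = doc.get(0);
    }
    // The dictionary is fetched through the transaction, so release it before the transaction ends.
    drop(dictionary);
    Ok(())
}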
View File

@ -168,10 +168,12 @@ impl<'a, 'i> Transform<'a, 'i> {
let external_documents_ids = self.index.external_documents_ids();
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
let dictionary = self.index.document_decompression_dictionary(wtxn)?;
let primary_key = cursor.primary_key().to_string();
let primary_key_id =
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
let mut decompression_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut document_sorter_value_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new();
@ -247,18 +249,17 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut skip_insertion = false;
if let Some(original_docid) = original_docid {
let original_key = original_docid;
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::Bytes>()
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
let base_compressed_obkv = self.index.documents.get(wtxn, &original_key)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let base_obkv = base_compressed_obkv.decompress_with_optional_dictionary(
&mut decompression_buffer,
dictionary.as_ref(),
)?;
// We check whether the two documents are exactly equal; if they are, we can skip this document entirely.
if base_obkv == obkv_buffer {
if base_obkv.as_bytes() == obkv_buffer {
// we're not replacing anything
self.replaced_documents_ids.remove(original_docid);
// and we need to put back the original id as it was before
@ -278,13 +279,12 @@ impl<'a, 'i> Transform<'a, 'i> {
document_sorter_value_buffer.clear();
document_sorter_value_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
base_obkv,
deladd_operation,
&mut document_sorter_value_buffer,
)?;
self.original_sorter
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
let base_obkv = KvReader::new(base_obkv);
if let Some(flattened_obkv) =
Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
{
@ -348,9 +348,12 @@ impl<'a, 'i> Transform<'a, 'i> {
documents_seen: documents_count,
});
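// The decompression dictionary holds a borrow taken through `wtxn`, so release it before the mutable writes below.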
drop(dictionary);
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
self.index.put_primary_key(wtxn, &primary_key)?;
self.documents_count += documents_count;
// Now that we have a valid sorter that contains the user id and the obkv, we
// give it to the last transforming function which returns the TransformOutput.
Ok(documents_count)
@ -1035,15 +1038,21 @@ impl<'a, 'i> Transform<'a, 'i> {
if original_sorter.is_some() || flattened_sorter.is_some() {
let modified_faceted_fields = settings_diff.modified_faceted_fields();
let dictionary = self.index.document_decompression_dictionary(wtxn)?;
let mut original_obkv_buffer = Vec::new();
let mut flattened_obkv_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new();
let mut buffer = Vec::new();
for result in self.index.external_documents_ids().iter(wtxn)? {
let (external_id, docid) = result?;
let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
let old_compressed_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let old_obkv = old_compressed_obkv
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,

View File

@ -19,6 +19,7 @@ use super::helpers::{
use super::MergeFn;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::heed_codec::CompressedKvWriterU16;
use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
@ -162,6 +163,7 @@ pub(crate) fn write_typed_chunk_into_index(
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let dictionary = index.document_compression_dictionary(wtxn)?;
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -211,7 +213,17 @@ pub(crate) fn write_typed_chunk_into_index(
let db = index.documents.remap_data_type::<Bytes>();
if !writer.is_empty() {
db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
let uncompressed_document_bytes = writer.into_inner().unwrap();
match dictionary.as_ref() {
Some(dictionary) => {
let compressed = CompressedKvWriterU16::new_with_dictionary(
&uncompressed_document_bytes,
dictionary,
)?;
db.put(wtxn, &docid, compressed.as_bytes())?
}
None => db.put(wtxn, &docid, &uncompressed_document_bytes)?,
}
operations.push(DocumentOperation {
external_id: external_id.to_string(),
internal_id: docid,

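The write side is the mirror image: serialize the obkv as before, then compress it with the index's dictionary when one exists. A standalone sketch of that branch; the function name, its signature, and the `milli::heed_codec` module path are illustrative assumptions, while the calls are the ones in the hunk above:

use heed::types::Bytes;
use milli::heed_codec::CompressedKvWriterU16;
use milli::Index;

// Sketch only: store one serialized document, compressed when a dictionary is available.
fn store_document(
    index: &Index,
    wtxn: &mut heed::RwTxn<'_>,
    docid: u32,
    uncompressed_document_bytes: &[u8],
) -> milli::Result<()> {
    let dictionary = index.document_compression_dictionary(wtxn)?;
    let db = index.documents.remap_data_type::<Bytes>();
    match dictionary.as_ref() {
        Some(dictionary) => {
            // Compress against the shared dictionary before writing.
            let compressed =
                CompressedKvWriterU16::new_with_dictionary(uncompressed_document_bytes, dictionary)?;
            db.put(wtxn, &docid, compressed.as_bytes())?;
        }
        // Without a dictionary the raw obkv bytes are stored unchanged.
        None => db.put(wtxn, &docid, uncompressed_document_bytes)?,
    }
    Ok(())
}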
View File

@ -1769,6 +1769,8 @@ mod tests {
// Check that the searchable field is correctly set to "name" only.
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
// When we search for something that is not in
// the searchable fields it must not return any document.
let result = index.search(&rtxn).query("23").execute().unwrap();
@ -1777,10 +1779,17 @@ mod tests {
// When we search for something that is in the searchable fields
// we must find the appropriate document.
let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
let mut compressed_documents =
index.compressed_documents(&rtxn, result.documents_ids).unwrap();
let fid_map = index.fields_ids_map(&rtxn).unwrap();
assert_eq!(documents.len(), 1);
assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
assert_eq!(compressed_documents.len(), 1);
let (_id, compressed_document) = compressed_documents.remove(0);
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
assert_eq!(document.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
drop(dictionary);
drop(rtxn);
// We change the searchable fields to be the "name" field only.
@ -1805,6 +1814,7 @@ mod tests {
// Check that the searchable fields have been reset and documents are found now.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let fid_map = index.fields_ids_map(&rtxn).unwrap();
let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
@ -1813,8 +1823,13 @@ mod tests {
snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
let result = index.search(&rtxn).query("23").execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
let mut compressed_documents =
index.compressed_documents(&rtxn, result.documents_ids).unwrap();
let (_id, compressed_document) = compressed_documents.remove(0);
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
assert_eq!(document.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
}
#[test]
@ -1949,15 +1964,20 @@ mod tests {
// Check that the filterable fields are correctly set.
let rtxn = index.read_txn().unwrap();
let mut buffer = Vec::new();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let fidmap = index.fields_ids_map(&rtxn).unwrap();
for document in index.all_documents(&rtxn).unwrap() {
let document = document.unwrap();
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
for result in index.all_compressed_documents(&rtxn).unwrap() {
let (_id, compressed_document) = result.unwrap();
let document = compressed_document
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let json =
crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document).unwrap();
println!("json: {:?}", json);
}
let count = index
@ -1968,6 +1988,7 @@ mod tests {
.unwrap()
.count();
assert_eq!(count, 3);
drop(dictionary);
drop(rtxn);
// Index a little more documents with new and current facets values.
@ -2057,6 +2078,7 @@ mod tests {
#[test]
fn set_asc_desc_field() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.autogenerate_docids = true;
// Set the filterable fields to be the age.
@ -2078,12 +2100,16 @@ mod tests {
// Run an empty query just to ensure that the search results are ordered.
let rtxn = index.read_txn().unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
let documents = index.documents(&rtxn, documents_ids).unwrap();
let compressed_documents = index.compressed_documents(&rtxn, documents_ids).unwrap();
// Fetch the documents' "age" field in the order in which the documents appear.
let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
let iter = documents.into_iter().map(|(_, doc)| {
let iter = compressed_documents.into_iter().map(|(_, compressed_doc)| {
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let bytes = doc.get(age_field_id).unwrap();
let string = std::str::from_utf8(bytes).unwrap();
string.parse::<u32>().unwrap()
@ -2480,6 +2506,7 @@ mod tests {
#[test]
fn setting_impact_relevancy() {
let mut index = TempIndex::new();
let mut buffer = Vec::new();
index.index_documents_config.autogenerate_docids = true;
// Set the genres setting
@ -2512,8 +2539,12 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).query("S").execute().unwrap();
let first_id = documents_ids[0];
let documents = index.documents(&rtxn, documents_ids).unwrap();
let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap();
let documents = index.compressed_documents(&rtxn, documents_ids).unwrap();
let (_, compressed_content) = documents.iter().find(|(id, _)| *id == first_id).unwrap();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let content = compressed_content
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap();
let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap();
@ -2681,7 +2712,7 @@ mod tests {
wtxn.commit().unwrap();
let rtxn = index.write_txn().unwrap();
let docs: StdResult<Vec<_>, _> = index.all_documents(&rtxn).unwrap().collect();
let docs: StdResult<Vec<_>, _> = index.all_compressed_documents(&rtxn).unwrap().collect();
let docs = docs.unwrap();
assert_eq!(docs.len(), 5);
}
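Several of the tests above repeat the same decompress-then-`obkv_to_json` dance. A hypothetical helper bundling it; the name, the `milli::Result` alias, and the returned `serde_json::Map` type are assumptions, while every call in the body appears in this diff or in the dump code changed earlier:

use milli::{obkv_to_json, Index};

// Sketch only: decompress every stored document and convert it to JSON for inspection.
fn dump_documents(index: &Index) -> milli::Result<Vec<serde_json::Map<String, serde_json::Value>>> {
    let rtxn = index.read_txn()?;
    let dictionary = index.document_decompression_dictionary(&rtxn)?;
    let fields_ids_map = index.fields_ids_map(&rtxn)?;
    let all_fields: Vec<_> = fields_ids_map.ids().collect();
    let mut buffer = Vec::new();
    let mut documents = Vec::new();
    for result in index.all_compressed_documents(&rtxn)? {
        let (_docid, compressed) = result?;
        // The scratch buffer is reused because each document is converted before the next read.
        let doc = compressed.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;
        documents.push(obkv_to_json(&all_fields, &fields_ids_map, doc)?);
    }
    Ok(documents)
}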

View File

@ -317,7 +317,20 @@ fn criteria_ascdesc() {
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::<Vec<_>>();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
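// One scratch buffer per document: each decompressed obkv reads from the buffer it was decompressed into, and all of them are kept alive at once in the collected Vec below.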
let mut buffers = vec![Vec::new(); index.number_of_documents(&rtxn).unwrap() as usize];
let documents = index
.all_compressed_documents(&rtxn)
.unwrap()
.zip(buffers.iter_mut())
.map(|(compressed, buffer)| {
let (id, compressed) = compressed.unwrap();
let doc = compressed
.decompress_with_optional_dictionary(buffer, dictionary.as_ref())
.unwrap();
(id, doc)
})
.collect::<Vec<_>>();
for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] {
eprintln!("Testing with criterion: {:?}", &criterion);