mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-27 00:31:02 +00:00
Introduce a heed codec that reduces the size of small amounts of serialized integers
This commit is contained in:
committed by
Clément Renault
parent
3e2250423c
commit
5664c37539
@ -21,7 +21,7 @@ use rayon::prelude::*;
|
||||
use roaring::RoaringBitmap;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use milli::heed_codec::CsvStringRecordCodec;
|
||||
use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec};
|
||||
use milli::tokenizer::{simple_tokenizer, only_words};
|
||||
use milli::{SmallVec32, Index, DocumentId, BEU32};
|
||||
|
||||
@ -197,7 +197,6 @@ impl Store {
|
||||
{
|
||||
// postings positions ids keys are all prefixed
|
||||
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
// We prefix the words by the document id.
|
||||
key.extend_from_slice(&id.to_be_bytes());
|
||||
@ -207,12 +206,11 @@ impl Store {
|
||||
key.truncate(base_size);
|
||||
key.extend_from_slice(word.as_bytes());
|
||||
// We serialize the positions into a buffer.
|
||||
buffer.clear();
|
||||
buffer.reserve(positions.serialized_size());
|
||||
positions.serialize_into(&mut buffer)?;
|
||||
let bytes = ByteorderXRoaringBitmapCodec::bytes_encode(&positions)
|
||||
.with_context(|| format!("could not serialize positions"))?;
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key) {
|
||||
sorter.insert(&key, &buffer)?;
|
||||
sorter.insert(&key, &bytes)?;
|
||||
}
|
||||
}
|
||||
|
||||
@ -309,7 +307,11 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
||||
Ok(values[0].to_vec())
|
||||
},
|
||||
key => match key[0] {
|
||||
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORD_DOCID_POSITIONS_BYTE => {
|
||||
WORD_DOCID_POSITIONS_BYTE => {
|
||||
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
||||
Ok(values[0].to_vec())
|
||||
},
|
||||
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
|
||||
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
||||
|
@ -257,13 +257,13 @@ fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::
|
||||
|
||||
fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
||||
use heed::types::DecodeIgnore;
|
||||
use milli::RoaringBitmapCodec;
|
||||
use milli::ByteorderXRoaringBitmapCodec;
|
||||
|
||||
let mut values_length = Vec::new();
|
||||
let mut count = 0;
|
||||
|
||||
let iter = index.docid_word_positions.as_polymorph().iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)?;
|
||||
for result in iter {
|
||||
let db = index.docid_word_positions.as_polymorph();
|
||||
for result in db.iter::<_, DecodeIgnore, ByteorderXRoaringBitmapCodec>(rtxn)? {
|
||||
let ((), val) = result?;
|
||||
values_length.push(val.len() as u32);
|
||||
count += 1;
|
||||
|
Reference in New Issue
Block a user