Remove the limit of 1000 positions per attribute

Instead of using an arbitrary limit, we encode the absolute position in a u32, using the strong (high-order) u16 for the field id and the weak (low-order) u16 for the relative position in the attribute.
2025-11-04 18:06:28 +00:00 · 2021-09-22 17:48:24 +02:00
parent 8f6b6c9042
commit 360c5ff3df
6 changed files with 91 additions and 24 deletions
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -10,8 +10,7 @@ use serde_json::Value;

 use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
-use crate::proximity::ONE_ATTRIBUTE;
-use crate::{FieldId, Result};
+use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};

 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
@@ -63,7 +62,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
                if let Some(field) = json_to_string(&value, &mut field_buffer) {
                    let analyzed = analyzer.analyze(field);
                    let tokens = process_tokens(analyzed.tokens())
-                        .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
+                        .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE);

                    for (index, token) in tokens {
                        let token = token.text().trim();
@@ -71,10 +70,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
                            key_buffer.truncate(mem::size_of::<u32>());
                            key_buffer.extend_from_slice(token.as_bytes());

-                            let position: u32 = index
+                            let position: u16 = index
                                .try_into()
                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                            let position = field_id as u32 * ONE_ATTRIBUTE + position;
+                            let position = absolute_from_relative_position(field_id, position);
                            docid_word_positions_sorter
                                .insert(&key_buffer, &position.to_ne_bytes())?;
                        }
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -10,8 +10,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::extract_position;
-use crate::{DocumentId, FieldId, Result};
+use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};

 /// Extracts the field id word count and the documents ids where
 /// this field id with this amount of words appear.
@@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
        }

        for position in read_u32_ne_bytes(value) {
-            let (field_id, position) = extract_position(position);
-            let word_count = position + 1;
+            let (field_id, position) = relative_from_absolute_position(position);
+            let word_count = position as u32 + 1;

            let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
            *value = cmp::max(*value, word_count);