mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Remove limit of 1000 positions per attribute
Instead of using an arbitrary limit, we encode the absolute position in a u32, using the 16 most significant bits (a u16) for the field id and the 16 least significant bits (a u16) for the relative position in the attribute.
This commit is contained in:
		| @@ -53,9 +53,24 @@ pub type Attribute = u32; | ||||
| pub type DocumentId = u32; | ||||
| pub type FieldId = u16; | ||||
| pub type Position = u32; | ||||
| pub type RelativePosition = u16; | ||||
| pub type FieldDistribution = BTreeMap<String, u64>; | ||||
| pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; | ||||
|  | ||||
| pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; | ||||
|  | ||||
| /// Convert an absolute word position into a relative position. | ||||
| /// Returns the field id of the attribute, read from the 16 most significant bits of the absolute position, | ||||
| /// and the relative position in the attribute, read from the 16 least significant bits. | ||||
| pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { | ||||
|     ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) | ||||
| } | ||||
|  | ||||
| /// Compute the absolute word position by placing the field id of the attribute in the 16 most significant bits and the relative position in the attribute in the 16 least significant bits. | ||||
| pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { | ||||
|     (field_id as u32) << 16 | (relative as u32) | ||||
| } | ||||
|  | ||||
| /// Transform a raw obkv store into a JSON Object. | ||||
| pub fn obkv_to_json( | ||||
|     displayed_fields: &[FieldId], | ||||
| @@ -187,4 +202,26 @@ mod tests { | ||||
|         // the distance of hard separators is clamped to 8 anyway. | ||||
|         assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_relative_position_conversion() { | ||||
|         assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); | ||||
|         assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); | ||||
|         assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); | ||||
|         assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); | ||||
|         assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); | ||||
|         assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); | ||||
|         assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_absolute_position_conversion() { | ||||
|         assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); | ||||
|         assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); | ||||
|         assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); | ||||
|         assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); | ||||
|         assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); | ||||
|         assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); | ||||
|         assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,8 +1,7 @@ | ||||
| use std::cmp; | ||||
|  | ||||
| use crate::{Attribute, Position}; | ||||
| use crate::{relative_from_absolute_position, Position}; | ||||
|  | ||||
| pub const ONE_ATTRIBUTE: u32 = 1000; | ||||
| pub const MAX_DISTANCE: u32 = 8; | ||||
|  | ||||
| pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { | ||||
| @@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { | ||||
| } | ||||
|  | ||||
| pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { | ||||
|     let (lhs_attr, lhs_index) = extract_position(lhs); | ||||
|     let (rhs_attr, rhs_index) = extract_position(rhs); | ||||
|     let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs); | ||||
|     let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs); | ||||
|     if lhs_attr != rhs_attr { | ||||
|         MAX_DISTANCE | ||||
|     } else { | ||||
|         index_proximity(lhs_index, rhs_index) | ||||
|         index_proximity(lhs_index as u32, rhs_index as u32) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn extract_position(position: Position) -> (Attribute, Position) { | ||||
|     (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) | ||||
| } | ||||
|  | ||||
| pub fn path_proximity(path: &[Position]) -> u32 { | ||||
|     path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>() | ||||
| } | ||||
|   | ||||
| @@ -10,7 +10,7 @@ use crate::search::criteria::{ | ||||
|     resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, | ||||
| }; | ||||
| use crate::search::query_tree::{Operation, PrimitiveQueryPart}; | ||||
| use crate::Result; | ||||
| use crate::{absolute_from_relative_position, FieldId, Result}; | ||||
|  | ||||
| pub struct Exactness<'t> { | ||||
|     ctx: &'t dyn Context<'t>, | ||||
| @@ -181,7 +181,7 @@ fn resolve_state( | ||||
|                         ctx.field_id_word_count_docids(id, query_len)? | ||||
|                     { | ||||
|                         let mut attribute_candidates_array = | ||||
|                             attribute_start_with_docids(ctx, id as u32, query)?; | ||||
|                             attribute_start_with_docids(ctx, id, query)?; | ||||
|                         attribute_candidates_array.push(attribute_allowed_docids); | ||||
|                         candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||
|                     } | ||||
| @@ -199,8 +199,7 @@ fn resolve_state( | ||||
|             let mut candidates = RoaringBitmap::new(); | ||||
|             let attributes_ids = ctx.searchable_fields_ids()?; | ||||
|             for id in attributes_ids { | ||||
|                 let attribute_candidates_array = | ||||
|                     attribute_start_with_docids(ctx, id as u32, query)?; | ||||
|                 let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; | ||||
|                 candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||
|             } | ||||
|  | ||||
| @@ -290,12 +289,12 @@ fn resolve_state( | ||||
|  | ||||
| fn attribute_start_with_docids( | ||||
|     ctx: &dyn Context, | ||||
|     attribute_id: u32, | ||||
|     attribute_id: FieldId, | ||||
|     query: &[ExactQueryPart], | ||||
| ) -> heed::Result<Vec<RoaringBitmap>> { | ||||
|     let mut attribute_candidates_array = Vec::new(); | ||||
|     // start from attribute first position | ||||
|     let mut pos = attribute_id * 1000; | ||||
|     let mut pos = absolute_from_relative_position(attribute_id, 0); | ||||
|     for part in query { | ||||
|         use ExactQueryPart::*; | ||||
|         match part { | ||||
|   | ||||
| @@ -10,8 +10,7 @@ use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::{InternalError, SerializationError}; | ||||
| use crate::proximity::ONE_ATTRIBUTE; | ||||
| use crate::{FieldId, Result}; | ||||
| use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; | ||||
|  | ||||
| /// Extracts the word and positions where this word appear and | ||||
| /// prefixes it by the document id. | ||||
| @@ -63,7 +62,7 @@ pub fn extract_docid_word_positions<R: io::Read>( | ||||
|                 if let Some(field) = json_to_string(&value, &mut field_buffer) { | ||||
|                     let analyzed = analyzer.analyze(field); | ||||
|                     let tokens = process_tokens(analyzed.tokens()) | ||||
|                         .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); | ||||
|                         .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE); | ||||
|  | ||||
|                     for (index, token) in tokens { | ||||
|                         let token = token.text().trim(); | ||||
| @@ -71,10 +70,10 @@ pub fn extract_docid_word_positions<R: io::Read>( | ||||
|                             key_buffer.truncate(mem::size_of::<u32>()); | ||||
|                             key_buffer.extend_from_slice(token.as_bytes()); | ||||
|  | ||||
|                             let position: u32 = index | ||||
|                             let position: u16 = index | ||||
|                                 .try_into() | ||||
|                                 .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|                             let position = field_id as u32 * ONE_ATTRIBUTE + position; | ||||
|                             let position = absolute_from_relative_position(field_id, position); | ||||
|                             docid_word_positions_sorter | ||||
|                                 .insert(&key_buffer, &position.to_ne_bytes())?; | ||||
|                         } | ||||
|   | ||||
| @@ -10,8 +10,7 @@ use super::helpers::{ | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::proximity::extract_position; | ||||
| use crate::{DocumentId, FieldId, Result}; | ||||
| use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; | ||||
|  | ||||
| /// Extracts the field id word count and the documents ids where | ||||
| /// this field id with this amount of words appear. | ||||
| @@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>( | ||||
|         } | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             let (field_id, position) = extract_position(position); | ||||
|             let word_count = position + 1; | ||||
|             let (field_id, position) = relative_from_absolute_position(position); | ||||
|             let word_count = position as u32 + 1; | ||||
|  | ||||
|             let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); | ||||
|             *value = cmp::max(*value, word_count); | ||||
|   | ||||
| @@ -884,6 +884,44 @@ mod tests { | ||||
|         wtxn.commit().unwrap(); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn index_more_than_1000_positions_in_a_field() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(50 * 1024 * 1024); // 50 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|  | ||||
|         let mut big_object = HashMap::new(); | ||||
|         big_object.insert(S("id"), "wow"); | ||||
|         let content: String = | ||||
|             (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); | ||||
|         big_object.insert("content".to_string(), &content); | ||||
|  | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|  | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|         builder.add_documents(big_object).unwrap(); | ||||
|         builder.finish().unwrap(); | ||||
|         cursor.set_position(0); | ||||
|         let content = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
|  | ||||
|         let builder = IndexDocuments::new(&mut wtxn, &index, 0); | ||||
|         builder.execute(content, |_, _| ()).unwrap(); | ||||
|  | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let mut rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some()); | ||||
|         assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some()); | ||||
|         assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some()); | ||||
|         assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some()); | ||||
|         assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some()); | ||||
|         assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn index_documents_with_zeroes() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user