mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	feat: Make WordArea be based on char index and length
This commit is contained in:
		| @@ -48,6 +48,24 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { | |||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) { | ||||||
|  |     let mut byte_index = 0; | ||||||
|  |     let mut byte_length = 0; | ||||||
|  |  | ||||||
|  |     for (n, (i, c)) in text.char_indices().enumerate() { | ||||||
|  |         if n == index { | ||||||
|  |             byte_index = i; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if n + 1 == index + length { | ||||||
|  |             byte_length = i - byte_index + c.len_utf8(); | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     (byte_index, byte_length) | ||||||
|  | } | ||||||
|  |  | ||||||
| fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> { | fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> { | ||||||
|     let mut byte_indexes = BTreeMap::new(); |     let mut byte_indexes = BTreeMap::new(); | ||||||
|  |  | ||||||
| @@ -55,11 +73,18 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) | |||||||
|         let match_attribute = match_.attribute.attribute(); |         let match_attribute = match_.attribute.attribute(); | ||||||
|         if SchemaAttr::new(match_attribute) == attribute { |         if SchemaAttr::new(match_attribute) == attribute { | ||||||
|             let word_area = match_.word_area; |             let word_area = match_.word_area; | ||||||
|             let byte_index = word_area.byte_index() as usize; |  | ||||||
|             let length = word_area.length() as usize; |             let char_index = word_area.char_index() as usize; | ||||||
|  |             let char_length = word_area.length() as usize; | ||||||
|  |             let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); | ||||||
|  |  | ||||||
|             match byte_indexes.entry(byte_index) { |             match byte_indexes.entry(byte_index) { | ||||||
|                 Entry::Vacant(entry) => { entry.insert(length); }, |                 Entry::Vacant(entry) => { entry.insert(byte_length); }, | ||||||
|                 Entry::Occupied(mut entry) => if *entry.get() < length { entry.insert(length); }, |                 Entry::Occupied(mut entry) => { | ||||||
|  |                     if *entry.get() < byte_length { | ||||||
|  |                         entry.insert(byte_length); | ||||||
|  |                     } | ||||||
|  |                 }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -51,24 +51,14 @@ where B: TokenizerBuilder | |||||||
|     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { |     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { | ||||||
|         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { |         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { | ||||||
|  |  | ||||||
|  |             let document_id = self.document_id; | ||||||
|  |  | ||||||
|             // FIXME must u32::try_from instead |             // FIXME must u32::try_from instead | ||||||
|             let attribute = match Attribute::new(self.attribute.0, word_index as u32) { |             let attribute = match Attribute::new(self.attribute.0, word_index as u32) { | ||||||
|                 Ok(attribute) => attribute, |                 Ok(attribute) => attribute, | ||||||
|                 Err(_) => return Ok(()), |                 Err(_) => return Ok(()), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // FIXME must u16/u32::try_from instead |  | ||||||
|             let word_area = match WordArea::new(char_index as u32, word.len() as u16) { |  | ||||||
|                 Ok(word_area) => word_area, |  | ||||||
|                 Err(_) => return Ok(()), |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             let doc_index = DocIndex { |  | ||||||
|                 document_id: self.document_id, |  | ||||||
|                 attribute, |  | ||||||
|                 word_area |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             // insert the exact representation |             // insert the exact representation | ||||||
|             let word_lower = word.to_lowercase(); |             let word_lower = word.to_lowercase(); | ||||||
|  |  | ||||||
| @@ -77,9 +67,26 @@ where B: TokenizerBuilder | |||||||
|             // and the unidecoded lowercased version |             // and the unidecoded lowercased version | ||||||
|             let word_unidecoded = unidecode::unidecode(word).to_lowercase(); |             let word_unidecoded = unidecode::unidecode(word).to_lowercase(); | ||||||
|             if word_lower != word_unidecoded { |             if word_lower != word_unidecoded { | ||||||
|  |  | ||||||
|  |                 // FIXME must u16/u32::try_from instead | ||||||
|  |                 let length = word_unidecoded.chars().count() as u16; | ||||||
|  |                 let word_area = match WordArea::new(char_index as u32, length) { | ||||||
|  |                     Ok(word_area) => word_area, | ||||||
|  |                     Err(_) => return Ok(()), | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |                 let doc_index = DocIndex { document_id, attribute, word_area }; | ||||||
|                 self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); |                 self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  |             // FIXME must u16/u32::try_from instead | ||||||
|  |             let length = word.chars().count() as u16; | ||||||
|  |             let word_area = match WordArea::new(char_index as u32, length) { | ||||||
|  |                 Ok(word_area) => word_area, | ||||||
|  |                 Err(_) => return Ok(()), | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             let doc_index = DocIndex { document_id, attribute, word_area }; | ||||||
|             self.update.insert_doc_index(word_lower.into_bytes(), doc_index); |             self.update.insert_doc_index(word_lower.into_bytes(), doc_index); | ||||||
|         } |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
|   | |||||||
							
								
								
									
										38
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										38
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -97,15 +97,15 @@ enum AttributeError { | |||||||
| pub struct WordArea(u32); | pub struct WordArea(u32); | ||||||
|  |  | ||||||
| impl WordArea { | impl WordArea { | ||||||
|     /// Construct a `WordArea` from a word position in bytes |     /// Construct a `WordArea` from a word position expressed as | ||||||
|     /// and the length of it. |     /// a number of characters and the length of it. | ||||||
|     /// |     /// | ||||||
|     /// # Panics |     /// # Panics | ||||||
|     /// |     /// | ||||||
|     /// The byte index must not be greater than 2^22 |     /// The char index must not be greater than 2^22 | ||||||
|     /// and the length not greater than 1024. |     /// and the length not greater than 1024. | ||||||
|     fn new(byte_index: u32, length: u16) -> Result<WordArea, WordAreaError> { |     fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> { | ||||||
|         if byte_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { |         if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { | ||||||
|             return Err(WordAreaError::ByteIndexTooBig) |             return Err(WordAreaError::ByteIndexTooBig) | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -113,12 +113,12 @@ impl WordArea { | |||||||
|             return Err(WordAreaError::LengthTooBig) |             return Err(WordAreaError::LengthTooBig) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let byte_index = byte_index << 10; |         let char_index = char_index << 10; | ||||||
|         Ok(WordArea(byte_index | u32::from(length))) |         Ok(WordArea(char_index | u32::from(length))) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn new_faillible(byte_index: u32, length: u16) -> WordArea { |     fn new_faillible(char_index: u32, length: u16) -> WordArea { | ||||||
|         match WordArea::new(byte_index, length) { |         match WordArea::new(char_index, length) { | ||||||
|             Ok(word_area) => word_area, |             Ok(word_area) => word_area, | ||||||
|             Err(WordAreaError::ByteIndexTooBig) => { |             Err(WordAreaError::ByteIndexTooBig) => { | ||||||
|                 panic!("word area byte index must not be greater than 2^22") |                 panic!("word area byte index must not be greater than 2^22") | ||||||
| @@ -130,7 +130,7 @@ impl WordArea { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[inline] |     #[inline] | ||||||
|     pub fn byte_index(self) -> u32 { |     pub fn char_index(self) -> u32 { | ||||||
|         self.0 >> 10 |         self.0 >> 10 | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -143,7 +143,7 @@ impl WordArea { | |||||||
| impl fmt::Debug for WordArea { | impl fmt::Debug for WordArea { | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|         f.debug_struct("WordArea") |         f.debug_struct("WordArea") | ||||||
|             .field("byte_index", &self.byte_index()) |             .field("char_index", &self.char_index()) | ||||||
|             .field("length", &self.length()) |             .field("length", &self.length()) | ||||||
|             .finish() |             .finish() | ||||||
|     } |     } | ||||||
| @@ -270,26 +270,26 @@ mod tests { | |||||||
|             TestResult::from_bool(a < b) |             TestResult::from_bool(a < b) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult { |         fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { | ||||||
|             if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { |             if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { | ||||||
|                 return TestResult::discard() |                 return TestResult::discard() | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let word_area = WordArea::new_faillible(gen_byte_index, gen_length); |             let word_area = WordArea::new_faillible(gen_char_index, gen_length); | ||||||
|  |  | ||||||
|             let valid_char_index = word_area.byte_index() == gen_byte_index; |             let valid_char_index = word_area.char_index() == gen_char_index; | ||||||
|             let valid_length = word_area.length() == gen_length; |             let valid_length = word_area.length() == gen_length; | ||||||
|  |  | ||||||
|             TestResult::from_bool(valid_char_index && valid_length) |             TestResult::from_bool(valid_char_index && valid_length) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult { |         fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { | ||||||
|             if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { |             if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { | ||||||
|                 return TestResult::discard() |                 return TestResult::discard() | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let a = WordArea::new_faillible(gen_byte_index, gen_length); |             let a = WordArea::new_faillible(gen_char_index, gen_length); | ||||||
|             let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1); |             let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); | ||||||
|  |  | ||||||
|             TestResult::from_bool(a < b) |             TestResult::from_bool(a < b) | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -96,7 +96,7 @@ impl<'a> Iterator for Tokenizer<'a> { | |||||||
|                         let (spaces, word) = prefix.split_at(start_word); |                         let (spaces, word) = prefix.split_at(start_word); | ||||||
|  |  | ||||||
|                         self.inner = tail; |                         self.inner = tail; | ||||||
|                         self.char_index += spaces.len(); |                         self.char_index += spaces.chars().count(); | ||||||
|                         self.word_index += distance.map(Separator::to_usize).unwrap_or(0); |                         self.word_index += distance.map(Separator::to_usize).unwrap_or(0); | ||||||
|  |  | ||||||
|                         let token = Token { |                         let token = Token { | ||||||
| @@ -105,7 +105,7 @@ impl<'a> Iterator for Tokenizer<'a> { | |||||||
|                             char_index: self.char_index, |                             char_index: self.char_index, | ||||||
|                         }; |                         }; | ||||||
|  |  | ||||||
|                         self.char_index += word.len(); |                         self.char_index += word.chars().count(); | ||||||
|                         return Some(token) |                         return Some(token) | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
| @@ -122,7 +122,7 @@ impl<'a> Iterator for Tokenizer<'a> { | |||||||
|             let token = Token { |             let token = Token { | ||||||
|                 word: word, |                 word: word, | ||||||
|                 word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), |                 word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), | ||||||
|                 char_index: self.char_index + spaces.len(), |                 char_index: self.char_index + spaces.chars().count(), | ||||||
|             }; |             }; | ||||||
|             return Some(token) |             return Some(token) | ||||||
|         } |         } | ||||||
| @@ -173,7 +173,7 @@ mod tests { | |||||||
|  |  | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|         let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); |         let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); | ||||||
| @@ -181,8 +181,8 @@ mod tests { | |||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 19 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 })); | ||||||
|         assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 25 })); |         assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user