	Merge #426
426: Fix search highlight for non-unicode chars r=ManyTheFish a=Samyak2

# Pull Request

## What does this PR do?
Fixes https://github.com/meilisearch/MeiliSearch/issues/1480

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

## Changes
The `matching_bytes` function now takes a `&Token` and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function now returns the number of matching grapheme clusters instead of bytes.

Added proper highlighting in the HTTP UI:
- requires a dependency on `unicode-segmentation` to extract grapheme clusters from tokens
- the `<mark>` tag is put around only the matched part
- before this change, the entire word was highlighted even if only a part of it matched

## Questions
Since `matching_bytes` no longer returns a number of bytes but a number of grapheme clusters, should it be renamed to something like `matching_chars` or `matching_graphemes`? Will this break the API?

Thank you very much `@ManyTheFish` for helping 😄

Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
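For context, here is a minimal sketch (not part of the PR) of why byte offsets are the wrong unit for highlighting: with non-ASCII text, the byte length, the `char` count, and the grapheme-cluster count of the same word can all differ. It assumes the `unicode-segmentation` crate mentioned above:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // "café" written with a combining acute accent (U+0301)
    let word = "cafe\u{301}";

    // Three different lengths for the same visible word:
    assert_eq!(word.len(), 6);                    // UTF-8 bytes
    assert_eq!(word.chars().count(), 5);          // Unicode scalar values
    assert_eq!(word.graphemes(true).count(), 4);  // grapheme clusters
}
```

Slicing a highlight at a byte offset can land inside a multi-byte character; counting in grapheme clusters keeps the `<mark>` boundary on what the user perceives as a character.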
The HTTP UI's `Cargo.toml` bumps the tokenizer to v0.2.7, which the new `Token`-based API below relies on:

```diff
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
```
The HTTP UI's `Highlighter` now wraps only the matched part of a word in `<mark>` tags, instead of the whole word:

```diff
@@ -160,13 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                 let analyzed = self.analyzer.analyze(&old_string);
                 for (word, token) in analyzed.reconstruct() {
                     if token.is_word() {
-                        let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                        if to_highlight {
-                            string.push_str("<mark>")
-                        }
-                        string.push_str(word);
-                        if to_highlight {
-                            string.push_str("</mark>")
+                        match matching_words.matching_bytes(&token) {
+                            Some(chars_to_highlight) => {
+                                let mut chars = word.chars();
+
+                                string.push_str("<mark>");
+                                // push the part to highlight
+                                string.extend(chars.by_ref().take(chars_to_highlight));
+                                string.push_str("</mark>");
+                                // push the suffix after highlight
+                                string.extend(chars);
+                            }
+                            // no highlight
+                            None => string.push_str(word),
                         }
                     } else {
                         string.push_str(word);
```
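Isolated from the `Highlighter`, the partial-highlight trick above can be sketched as a standalone function (hypothetical name, same logic): a single `chars` iterator is consumed twice, first for the matched prefix, then for the remainder.

```rust
// Hypothetical helper mirroring the diff above: wrap the first
// `chars_to_highlight` characters of `word` in <mark> tags.
fn highlight_prefix(word: &str, chars_to_highlight: usize) -> String {
    let mut string = String::new();
    let mut chars = word.chars();

    string.push_str("<mark>");
    // by_ref() lets take() consume from the shared iterator
    string.extend(chars.by_ref().take(chars_to_highlight));
    string.push_str("</mark>");
    // the same iterator then yields only the unmatched suffix
    string.extend(chars);
    string
}

fn main() {
    assert_eq!(highlight_prefix("worldly", 5), "<mark>world</mark>ly");
}
```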
milli's own `Cargo.toml` gets the same bump:

```diff
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"
```
In milli's `MatchingWords` module, the tokenizer's `Token` type is imported:

```diff
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
```
`matching_bytes` now takes a `&Token` and converts the matched byte count into a character count before returning it:

```diff
@@ -33,15 +34,15 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
```
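The key step is the byte-to-character conversion. A rough sketch of what a conversion like `num_chars_from_bytes` has to do; the tokenizer's actual implementation may differ (for instance, by consulting the token's `char_map`):

```rust
// Assumed semantics, not the tokenizer's real code: count how many
// chars start within the first `num_bytes` bytes of `text`.
fn num_chars_from_bytes(text: &str, num_bytes: usize) -> usize {
    text.char_indices()
        .take_while(|(byte_offset, _)| *byte_offset < num_bytes)
        .count()
}

fn main() {
    // "café" with a combining accent: 6 bytes but only 5 chars
    assert_eq!(num_chars_from_bytes("cafe\u{301}", 6), 5);
}
```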
The test module gains the imports needed to build `Token` values:

```diff
@@ -178,8 +179,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
     use std::str::from_utf8;
 
+    use meilisearch_tokenizer::TokenKind;
+
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
```
Finally, the tests construct full `Token` values instead of passing plain `&str`s. The test words are all ASCII, so the expected character counts match the previous byte counts:

```diff
@@ -269,12 +273,82 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }
```