Fix search highlight for non-ASCII chars

The `matching_bytes` function takes a `&Token` now and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme
  clusters to highlight.

In essence, the `matching_bytes` function returns the number of matching
grapheme clusters instead of bytes. Should this function be renamed
then?

Added proper highlighting in the HTTP UI:
- requires a dependency on `unicode-segmentation` to extract grapheme
  clusters from tokens
- `<mark>` tag is put around only the matched part
    - before this change, the entire word was highlighted even if only a
      part of it matched
This commit is contained in:
Samyak S Sarnayak
2021-12-17 22:53:34 +05:30
parent 559e019de1
commit 30247d70cd
3 changed files with 26 additions and 12 deletions

View File

@@ -34,6 +34,7 @@ use structopt::StructOpt;
use tokio::fs::File as TFile;
use tokio::io::AsyncWriteExt;
use tokio::sync::broadcast;
use unicode_segmentation::UnicodeSegmentation;
use warp::filters::ws::Message;
use warp::http::Response;
use warp::Filter;
@@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
let analyzed = self.analyzer.analyze(&old_string);
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
let to_highlight = matching_words.matching_bytes(token.text()).is_some();
if to_highlight {
string.push_str("<mark>")
}
string.push_str(word);
if to_highlight {
string.push_str("</mark>")
let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
if chars_to_highlight > 0 {
let graphemes = word.graphemes(true);
let chars = graphemes.clone().into_iter();
string.push_str("<mark>");
string.push_str(
chars.take(chars_to_highlight).collect::<String>().as_str(),
);
string.push_str("</mark>");
let chars = graphemes.into_iter().skip(chars_to_highlight);
string.push_str(chars.collect::<String>().as_str());
} else {
string.push_str(word);
}
} else {
string.push_str(word);