Resolve PR comments

This commit is contained in:
many
2021-06-01 11:48:56 +02:00
parent 1df68d342a
commit 225ae6fd25
2 changed files with 10 additions and 7 deletions

View File

@@ -11,24 +11,28 @@ use super::build_dfa;
type IsPrefix = bool; type IsPrefix = bool;
/// The query tree builder is the interface to build a query tree. /// Structure created from a query tree
/// referencing words that match the given query tree.
#[derive(Default)] #[derive(Default)]
pub struct MatchingWords { pub struct MatchingWords {
dfas: Vec<(DFA, String, u8, IsPrefix)>, dfas: Vec<(DFA, String, u8, IsPrefix)>,
} }
impl MatchingWords { impl MatchingWords {
/// Lists all words which can be considered as a match for the query tree.
pub fn from_query_tree(tree: &Operation) -> Self { pub fn from_query_tree(tree: &Operation) -> Self {
// fetch matchable words from the query tree
let mut dfas: Vec<_> = fetch_queries(tree) let mut dfas: Vec<_> = fetch_queries(tree)
.into_iter() .into_iter()
// create DFAs for each word
.map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p))
.collect(); .collect();
// Sort words by length in descending order, prioritizing the longest word,
// in order to highlight the longest part of the matched word.
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len()));
Self { dfas } Self { dfas }
} }
/// Returns the number of matching bytes if the word matches. /// Returns the number of matching bytes if the word matches one of the query words.
pub fn matching_bytes(&self, word: &str) -> Option<usize> { pub fn matching_bytes(&self, word: &str) -> Option<usize> {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) {
Distance::Exact(t) if t <= *typo => { Distance::Exact(t) if t <= *typo => {
@@ -94,6 +98,8 @@ impl<T> IndexMut<(usize, usize)> for N2Array<T> {
} }
} }
/// Returns the distance between the source word and the target word,
/// and the number of bytes matching in the target word.
fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len()); let (n, m) = (source.len(), target.len());

View File

@@ -1,14 +1,11 @@
use std::collections::HashSet;
use std::{fmt, cmp, mem}; use std::{fmt, cmp, mem};
use fst::Set; use fst::Set;
use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::Index; use crate::Index;
use super::build_dfa;
type IsOptionalWord = bool; type IsOptionalWord = bool;
type IsPrefix = bool; type IsPrefix = bool;
@@ -519,7 +516,7 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
mod test { mod test {
use std::collections::HashMap; use std::collections::HashMap;
use maplit::{hashmap, hashset}; use maplit::hashmap;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use rand::{Rng, SeedableRng, rngs::StdRng}; use rand::{Rng, SeedableRng, rngs::StdRng};