mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-28 01:01:00 +00:00
feat: Simplify the levenshtein construction
This commit is contained in:
@ -1,4 +1,7 @@
|
||||
cargo-features = ["edition"]
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "raptor"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
@ -6,14 +9,15 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
[dependencies]
|
||||
byteorder = "1.2"
|
||||
fnv = "1.0"
|
||||
lazy_static = "1.1"
|
||||
|
||||
[dependencies.fst]
|
||||
git = "https://github.com/Kerollmops/fst.git"
|
||||
branch = "always-match-clone"
|
||||
branch = "automaton-for-deref"
|
||||
|
||||
[dependencies.levenshtein_automata]
|
||||
git = "https://github.com/Kerollmops/levenshtein-automata.git"
|
||||
branch = "custom-fst"
|
||||
branch = "new-custom-fst"
|
||||
features = ["fst_automaton"]
|
||||
|
||||
[dependencies.rocksdb]
|
||||
|
50
raptor/src/automaton.rs
Normal file
50
raptor/src/automaton.rs
Normal file
@ -0,0 +1,50 @@
|
||||
use std::ops::Deref;
|
||||
use fst::Automaton;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA, Distance,
|
||||
};
|
||||
|
||||
// Process-wide Levenshtein automaton builders, created lazily on first use
// and then reused by `build` for every query.
// The first argument (0, 1, 2) is the builder's maximum edit distance in the
// upstream `levenshtein_automata` API; the second (`false`) is presumably the
// "transposition costs one" switch -- TODO confirm against the forked crate.
lazy_static! {
|
||||
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
|
||||
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
|
||||
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
||||
}
|
||||
|
||||
/// A Levenshtein `DFA` bundled with the length of the query it was built from.
///
/// Instances are produced by [`build`], which fills `query_len` with
/// `query.len()` (the query's length in bytes).
pub struct DfaExt {
|
||||
// Byte length of the original query string.
query_len: usize,
|
||||
// The wrapped automaton; also exposed through `Deref`.
automaton: DFA,
|
||||
}
|
||||
|
||||
/// Dereferences a `DfaExt` to the `DFA` it wraps, so a `&DfaExt` can be used
/// wherever a `&DFA` is accepted.
impl Deref for DfaExt {
|
||||
type Target = DFA;
|
||||
|
||||
/// Returns a shared reference to the wrapped automaton.
fn deref(&self) -> &Self::Target {
|
||||
&self.automaton
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a prefix Levenshtein automaton for `query`.
///
/// The shared builder is picked from the query's length in bytes
/// (`str::len`, not the number of characters):
///
/// * 0 to 4 bytes: `LEVDIST0`
/// * 5 to 8 bytes: `LEVDIST1`
/// * 9 bytes or more: `LEVDIST2`
///
/// The returned [`DfaExt`] stores that same byte length as its `query_len`.
pub fn build(query: &str) -> DfaExt {
|
||||
let dfa = match query.len() {
|
||||
0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
|
||||
5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
|
||||
_ => LEVDIST2.build_prefix_dfa(query),
|
||||
};
|
||||
|
||||
DfaExt { query_len: query.len(), automaton: dfa }
|
||||
}
|
||||
|
||||
/// Extension of `fst::Automaton` for automatons that can also report a
/// `Distance` for a whole input and the length of the query they were built
/// from.
pub trait AutomatonExt: Automaton {
|
||||
/// Evaluates the bytes of `s` and returns the resulting `Distance`.
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
|
||||
/// Returns the length of the query this automaton was built from.
fn query_len(&self) -> usize;
|
||||
}
|
||||
|
||||
/// `AutomatonExt` for `DfaExt`: evaluation is forwarded to the wrapped `DFA`
/// and the query length is read from the stored field.
impl AutomatonExt for DfaExt {
|
||||
/// Delegates to the wrapped automaton's own `eval`.
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
|
||||
self.automaton.eval(s)
|
||||
}
|
||||
|
||||
/// Returns the `query_len` recorded when this value was constructed.
fn query_len(&self) -> usize {
|
||||
self.query_len
|
||||
}
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
|
||||
|
||||
/// Owns three `LevenshteinAutomatonBuilder`s; `new` constructs them with
/// first arguments 0, 1 and 2, stored at the matching index.
pub struct LevBuilder {
|
||||
// Index i holds the builder created with first argument i.
automatons: [LevenshteinAutomatonBuilder; 3],
|
||||
}
|
||||
|
||||
impl LevBuilder {
|
||||
/// Creates the three underlying builders, with first arguments 0, 1 and 2
/// and `false` as the second argument for each.
pub fn new() -> Self {
|
||||
Self {
|
||||
automatons: [
|
||||
LevenshteinAutomatonBuilder::new(0, false),
|
||||
LevenshteinAutomatonBuilder::new(1, false),
|
||||
LevenshteinAutomatonBuilder::new(2, false),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a prefix DFA for `query` and returns it together with the
/// query's byte length.
///
/// The builder is chosen from `query.len()` (bytes): up to 4 uses index 0,
/// up to 8 uses index 1, anything longer uses index 2.
///
/// # Panics
///
/// Panics if `query` is empty.
pub fn get_automaton(&self, query: &str) -> Levenshtein {
|
||||
assert!(!query.is_empty());
|
||||
|
||||
let dfa = if query.len() <= 4 {
|
||||
self.automatons[0].build_prefix_dfa(query)
|
||||
} else if query.len() <= 8 {
|
||||
self.automatons[1].build_prefix_dfa(query)
|
||||
} else {
|
||||
self.automatons[2].build_prefix_dfa(query)
|
||||
};
|
||||
|
||||
Levenshtein { dfa, query_len: query.len() }
|
||||
}
|
||||
}
|
||||
|
||||
/// A built `DFA` paired with the byte length of the query it was built from,
/// as returned by `LevBuilder::get_automaton`.
#[derive(Clone)]
|
||||
pub struct Levenshtein {
|
||||
/// The automaton built for the query.
pub dfa: DFA,
|
||||
/// `query.len()` of the originating query, in bytes.
pub query_len: usize,
|
||||
}
|
@ -1,24 +1,16 @@
|
||||
#![feature(nll)]
|
||||
|
||||
extern crate fst;
|
||||
extern crate fnv;
|
||||
extern crate group_by;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate byteorder;
|
||||
extern crate rocksdb;
|
||||
#[macro_use] extern crate lazy_static;
|
||||
|
||||
pub mod rank;
|
||||
pub mod metadata;
|
||||
pub mod levenshtein;
|
||||
pub mod automaton;
|
||||
|
||||
pub use self::metadata::{
|
||||
Metadata, MetadataBuilder,
|
||||
StreamWithState, StreamWithStateBuilder,
|
||||
UnionWithState, OpWithStateBuilder,
|
||||
IndexedValuesWithState,
|
||||
Stream, StreamBuilder,
|
||||
Union, OpBuilder,
|
||||
IndexedValues,
|
||||
};
|
||||
pub use self::rank::{RankedStream};
|
||||
pub use self::levenshtein::LevBuilder;
|
||||
pub use self::rank::RankedStream;
|
||||
|
||||
pub type DocumentId = u64;
|
||||
|
||||
|
@ -9,7 +9,7 @@ use std::mem;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{self, Map, MapBuilder, Automaton};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use DocIndex;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[repr(C)]
|
||||
struct Range {
|
||||
@ -256,23 +256,23 @@ unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
||||
|
||||
pub struct OpWithStateBuilder<'m, 'v, U> {
|
||||
inner: fst::map::OpWithStateBuilder<'m, U>,
|
||||
pub struct OpBuilder<'m, 'v> {
|
||||
inner: fst::map::OpBuilder<'m>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
impl<'m, 'v> OpBuilder<'m, 'v> {
|
||||
pub fn new(indexes: &'v DocIndexes) -> Self {
|
||||
Self {
|
||||
inner: fst::map::OpWithStateBuilder::new(),
|
||||
inner: fst::map::OpBuilder::new(),
|
||||
indexes: indexes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add<I, S>(mut self, streamable: I) -> Self
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||
{
|
||||
self.push(streamable);
|
||||
self
|
||||
@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
|
||||
pub fn push<I, S>(&mut self, streamable: I)
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||
{
|
||||
self.inner.push(streamable);
|
||||
}
|
||||
|
||||
pub fn union(self) -> UnionWithState<'m, 'v, U> {
|
||||
UnionWithState {
|
||||
pub fn union(self) -> Union<'m, 'v> {
|
||||
Union {
|
||||
inner: self.inner.union(),
|
||||
outs: Vec::new(),
|
||||
indexes: self.indexes,
|
||||
@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||
pub struct IndexedValuesWithState<'a, U> {
|
||||
pub struct IndexedValues<'a> {
|
||||
pub index: usize,
|
||||
pub values: &'a [DocIndex],
|
||||
pub state: U,
|
||||
}
|
||||
|
||||
pub struct UnionWithState<'m, 'v, U> {
|
||||
inner: fst::map::UnionWithState<'m, U>,
|
||||
outs: Vec<IndexedValuesWithState<'v, U>>,
|
||||
pub struct Union<'m, 'v> {
|
||||
inner: fst::map::Union<'m>,
|
||||
outs: Vec<IndexedValues<'v>>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U>
|
||||
where
|
||||
U: Clone,
|
||||
{
|
||||
type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]);
|
||||
impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> {
|
||||
type Item = (&'a [u8], &'a [IndexedValues<'a>]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
@ -322,8 +318,7 @@ where
|
||||
for ivalue in ivalues {
|
||||
if let Some(values) = self.indexes.get(ivalue.value) {
|
||||
let index = ivalue.index;
|
||||
let state = ivalue.state.clone();
|
||||
self.outs.push(IndexedValuesWithState { index, values, state })
|
||||
self.outs.push(IndexedValues { index, values })
|
||||
}
|
||||
}
|
||||
Some((s, &self.outs))
|
||||
@ -333,44 +328,43 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamWithStateBuilder<'m, 'v, A> {
|
||||
inner: fst::map::StreamWithStateBuilder<'m, A>,
|
||||
pub struct StreamBuilder<'m, 'v, A> {
|
||||
inner: fst::map::StreamBuilder<'m, A>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A>
|
||||
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
{
|
||||
type Item = <Self::Into as fst::Streamer<'a>>::Item;
|
||||
type Into = StreamWithState<'m, 'v, A>;
|
||||
type Into = Stream<'m, 'v, A>;
|
||||
|
||||
fn into_stream(self) -> Self::Into {
|
||||
StreamWithState {
|
||||
Stream {
|
||||
inner: self.inner.into_stream(),
|
||||
indexes: self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||
inner: fst::map::StreamWithState<'m, A>,
|
||||
pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||
inner: fst::map::Stream<'m, A>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A>
|
||||
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
{
|
||||
type Item = (&'a [u8], &'a [DocIndex], A::State);
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
Some((key, i, state)) => {
|
||||
Some((key, i)) => {
|
||||
match self.indexes.get(i) {
|
||||
Some(values) => Some((key, values, state)),
|
||||
Some(values) => Some((key, values)),
|
||||
None => None,
|
||||
}
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn contains_exact(matches: &[Match]) -> bool {
|
||||
|
@ -6,13 +6,14 @@ mod sum_of_words_position;
|
||||
mod exact;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::rc::Rc;
|
||||
use std::{mem, vec};
|
||||
use fst;
|
||||
use fnv::FnvHashMap;
|
||||
use levenshtein::Levenshtein;
|
||||
use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState};
|
||||
use {Match, DocumentId};
|
||||
use group_by::GroupByMut;
|
||||
use crate::automaton::{DfaExt, AutomatonExt};
|
||||
use crate::metadata::{DocIndexes, OpBuilder, Union};
|
||||
use crate::{Match, DocumentId};
|
||||
|
||||
use self::{
|
||||
sum_of_typos::sum_of_typos,
|
||||
@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
|
||||
pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
|
||||
|
||||
impl<'m, 'v> RankedStream<'m, 'v> {
|
||||
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
|
||||
let mut op = OpWithStateBuilder::new(indexes);
|
||||
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<DfaExt>, limit: usize) -> Self {
|
||||
let mut op = OpBuilder::new(indexes);
|
||||
|
||||
for automaton in automatons.iter().map(|l| l.dfa.clone()) {
|
||||
let stream = map.search(automaton).with_state();
|
||||
let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
|
||||
for automaton in automatons.iter().cloned() {
|
||||
let stream = map.search(automaton);
|
||||
op.push(stream);
|
||||
}
|
||||
|
||||
@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
|
||||
|
||||
enum RankedStreamInner<'m, 'v> {
|
||||
Fed {
|
||||
inner: UnionWithState<'m, 'v, u32>,
|
||||
automatons: Vec<Levenshtein>,
|
||||
inner: Union<'m, 'v>,
|
||||
automatons: Vec<Rc<DfaExt>>,
|
||||
limit: usize,
|
||||
matches: FnvHashMap<DocumentId, Vec<Match>>,
|
||||
},
|
||||
@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
||||
for iv in indexed_values {
|
||||
|
||||
let automaton = &automatons[iv.index];
|
||||
let distance = automaton.dfa.distance(iv.state).to_u8();
|
||||
let distance = automaton.eval(string).to_u8();
|
||||
let same_length = string.len() == automaton.query_len();
|
||||
|
||||
for di in iv.values {
|
||||
let match_ = Match {
|
||||
@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
||||
distance: distance,
|
||||
attribute: di.attribute,
|
||||
attribute_index: di.attribute_index,
|
||||
is_exact: distance == 0 && string.len() == automaton.query_len,
|
||||
is_exact: distance == 0 && same_length,
|
||||
};
|
||||
matches.entry(di.document)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(match_);
|
||||
.or_insert_with(Vec::new)
|
||||
.push(match_);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(matches: &[Match]) -> usize {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(matches: &[Match]) -> u8 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
const MAX_DISTANCE: u32 = 8;
|
||||
|
||||
|
Reference in New Issue
Block a user