feat: Simplify the Levenshtein automaton construction

2025-11-06 02:46:28 +00:00 · 2018-09-09 11:13:58 +02:00
parent f0f5fc9891
commit 31e04f0120
17 changed files with 156 additions and 141 deletions
--- a/raptor/Cargo.toml
+++ b/raptor/Cargo.toml
@@ -1,4 +1,7 @@
+cargo-features = ["edition"]
+
 [package]
+edition = "2018"
 name = "raptor"
 version = "0.1.0"
 authors = ["Kerollmops <renault.cle@gmail.com>"]
@@ -6,14 +9,15 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
 [dependencies]
 byteorder = "1.2"
 fnv = "1.0"
+lazy_static = "1.1"

 [dependencies.fst]
 git = "https://github.com/Kerollmops/fst.git"
-branch = "always-match-clone"
+branch = "automaton-for-deref"

 [dependencies.levenshtein_automata]
 git = "https://github.com/Kerollmops/levenshtein-automata.git"
-branch = "custom-fst"
+branch = "new-custom-fst"
 features = ["fst_automaton"]

 [dependencies.rocksdb]
--- a/raptor/src/automaton.rs
+++ b/raptor/src/automaton.rs
@@ -0,0 +1,50 @@
+use std::ops::Deref;
+use fst::Automaton;
+use levenshtein_automata::{
+    LevenshteinAutomatonBuilder as LevBuilder,
+    DFA, Distance,
+};
+
+lazy_static! {
+    static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
+    static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
+    static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
+}
+
+pub struct DfaExt {
+    query_len: usize,
+    automaton: DFA,
+}
+
+impl Deref for DfaExt {
+    type Target = DFA;
+
+    fn deref(&self) -> &Self::Target {
+        &self.automaton
+    }
+}
+
+pub fn build(query: &str) -> DfaExt {
+    let dfa = match query.len() {
+        0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
+        5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
+        _       => LEVDIST2.build_prefix_dfa(query),
+    };
+
+    DfaExt { query_len: query.len(), automaton: dfa }
+}
+
+pub trait AutomatonExt: Automaton {
+    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
+    fn query_len(&self) -> usize;
+}
+
+impl AutomatonExt for DfaExt {
+    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
+        self.automaton.eval(s)
+    }
+
+    fn query_len(&self) -> usize {
+        self.query_len
+    }
+}
--- a/raptor/src/levenshtein.rs
+++ b/raptor/src/levenshtein.rs
@@ -1,37 +0,0 @@
-use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
-
-pub struct LevBuilder {
-    automatons: [LevenshteinAutomatonBuilder; 3],
-}
-
-impl LevBuilder {
-    pub fn new() -> Self {
-        Self {
-            automatons: [
-                LevenshteinAutomatonBuilder::new(0, false),
-                LevenshteinAutomatonBuilder::new(1, false),
-                LevenshteinAutomatonBuilder::new(2, false),
-            ],
-        }
-    }
-
-    pub fn get_automaton(&self, query: &str) -> Levenshtein {
-        assert!(!query.is_empty());
-
-        let dfa = if query.len() <= 4 {
-            self.automatons[0].build_prefix_dfa(query)
-        } else if query.len() <= 8 {
-            self.automatons[1].build_prefix_dfa(query)
-        } else {
-            self.automatons[2].build_prefix_dfa(query)
-        };
-
-        Levenshtein { dfa, query_len: query.len() }
-    }
-}
-
-#[derive(Clone)]
-pub struct Levenshtein {
-    pub dfa: DFA,
-    pub query_len: usize,
-}
--- a/raptor/src/lib.rs
+++ b/raptor/src/lib.rs
@@ -1,24 +1,16 @@
-#![feature(nll)]
-
-extern crate fst;
-extern crate fnv;
-extern crate group_by;
-extern crate levenshtein_automata;
-extern crate byteorder;
-extern crate rocksdb;
+#[macro_use] extern crate lazy_static;

 pub mod rank;
 pub mod metadata;
-pub mod levenshtein;
+pub mod automaton;

 pub use self::metadata::{
    Metadata, MetadataBuilder,
-    StreamWithState, StreamWithStateBuilder,
-    UnionWithState, OpWithStateBuilder,
-    IndexedValuesWithState,
+    Stream, StreamBuilder,
+    Union, OpBuilder,
+    IndexedValues,
 };
-pub use self::rank::{RankedStream};
-pub use self::levenshtein::LevBuilder;
+pub use self::rank::RankedStream;

 pub type DocumentId = u64;

--- a/raptor/src/metadata.rs
+++ b/raptor/src/metadata.rs
@@ -9,7 +9,7 @@ use std::mem;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use fst::{self, Map, MapBuilder, Automaton};
 use fst::raw::MmapReadOnly;
-use DocIndex;
+use crate::DocIndex;

 #[repr(C)]
 struct Range {
@@ -256,23 +256,23 @@ unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    from_raw_parts(ptr, len)
 }

-pub struct OpWithStateBuilder<'m, 'v, U> {
-    inner: fst::map::OpWithStateBuilder<'m, U>,
+pub struct OpBuilder<'m, 'v> {
+    inner: fst::map::OpBuilder<'m>,
    indexes: &'v DocIndexes,
 }

-impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
+impl<'m, 'v> OpBuilder<'m, 'v> {
    pub fn new(indexes: &'v DocIndexes) -> Self {
        Self {
-            inner: fst::map::OpWithStateBuilder::new(),
+            inner: fst::map::OpBuilder::new(),
            indexes: indexes,
        }
    }

    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
-        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
-        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
+        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
+        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.push(streamable);
        self
@@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {

    pub fn push<I, S>(&mut self, streamable: I)
    where
-        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
-        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
+        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
+        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
    }

-    pub fn union(self) -> UnionWithState<'m, 'v, U> {
-        UnionWithState {
+    pub fn union(self) -> Union<'m, 'v> {
+        Union {
            inner: self.inner.union(),
            outs: Vec::new(),
            indexes: self.indexes,
@@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
 }

 #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
-pub struct IndexedValuesWithState<'a, U> {
+pub struct IndexedValues<'a> {
    pub index: usize,
    pub values: &'a [DocIndex],
-    pub state: U,
 }

-pub struct UnionWithState<'m, 'v, U> {
-    inner: fst::map::UnionWithState<'m, U>,
-    outs: Vec<IndexedValuesWithState<'v, U>>,
+pub struct Union<'m, 'v> {
+    inner: fst::map::Union<'m>,
+    outs: Vec<IndexedValues<'v>>,
    indexes: &'v DocIndexes,
 }

-impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U>
-where
-    U: Clone,
-{
-    type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]);
+impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> {
+    type Item = (&'a [u8], &'a [IndexedValues<'a>]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.inner.next() {
@@ -322,8 +318,7 @@ where
                for ivalue in ivalues {
                    if let Some(values) = self.indexes.get(ivalue.value) {
                        let index = ivalue.index;
-                        let state = ivalue.state.clone();
-                        self.outs.push(IndexedValuesWithState { index, values, state })
+                        self.outs.push(IndexedValues { index, values })
                    }
                }
                Some((s, &self.outs))
@@ -333,44 +328,43 @@ where
    }
 }

-pub struct StreamWithStateBuilder<'m, 'v, A> {
-    inner: fst::map::StreamWithStateBuilder<'m, A>,
+pub struct StreamBuilder<'m, 'v, A> {
+    inner: fst::map::StreamBuilder<'m, A>,
    indexes: &'v DocIndexes,
 }

-impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A>
+impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A>
 where
    A: Automaton,
    A::State: Clone,
 {
    type Item = <Self::Into as fst::Streamer<'a>>::Item;
-    type Into = StreamWithState<'m, 'v, A>;
+    type Into = Stream<'m, 'v, A>;

    fn into_stream(self) -> Self::Into {
-        StreamWithState {
+        Stream {
            inner: self.inner.into_stream(),
            indexes: self.indexes,
        }
    }
 }

-pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
-    inner: fst::map::StreamWithState<'m, A>,
+pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
+    inner: fst::map::Stream<'m, A>,
    indexes: &'v DocIndexes,
 }

-impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A>
+impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A>
 where
    A: Automaton,
-    A::State: Clone,
 {
-    type Item = (&'a [u8], &'a [DocIndex], A::State);
+    type Item = (&'a [u8], &'a [DocIndex]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.inner.next() {
-            Some((key, i, state)) => {
+            Some((key, i)) => {
                match self.indexes.get(i) {
-                    Some(values) => Some((key, values, state)),
+                    Some(values) => Some((key, values)),
                    None => None,
                }
            },
--- a/raptor/src/rank/exact.rs
+++ b/raptor/src/rank/exact.rs
@@ -1,7 +1,7 @@
 use std::cmp::Ordering;
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 #[inline]
 fn contains_exact(matches: &[Match]) -> bool {
--- a/raptor/src/rank/mod.rs
+++ b/raptor/src/rank/mod.rs
@@ -6,13 +6,14 @@ mod sum_of_words_position;
 mod exact;

 use std::cmp::Ordering;
+use std::rc::Rc;
 use std::{mem, vec};
 use fst;
 use fnv::FnvHashMap;
-use levenshtein::Levenshtein;
-use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState};
-use {Match, DocumentId};
 use group_by::GroupByMut;
+use crate::automaton::{DfaExt, AutomatonExt};
+use crate::metadata::{DocIndexes, OpBuilder, Union};
+use crate::{Match, DocumentId};

 use self::{
    sum_of_typos::sum_of_typos,
@@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
 pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);

 impl<'m, 'v> RankedStream<'m, 'v> {
-    pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
-        let mut op = OpWithStateBuilder::new(indexes);
+    pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<DfaExt>, limit: usize) -> Self {
+        let mut op = OpBuilder::new(indexes);

-        for automaton in automatons.iter().map(|l| l.dfa.clone()) {
-            let stream = map.search(automaton).with_state();
+        let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
+        for automaton in automatons.iter().cloned() {
+            let stream = map.search(automaton);
            op.push(stream);
        }

@@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {

 enum RankedStreamInner<'m, 'v> {
    Fed {
-        inner: UnionWithState<'m, 'v, u32>,
-        automatons: Vec<Levenshtein>,
+        inner: Union<'m, 'v>,
+        automatons: Vec<Rc<DfaExt>>,
        limit: usize,
        matches: FnvHashMap<DocumentId, Vec<Match>>,
    },
@@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
                            for iv in indexed_values {

                                let automaton = &automatons[iv.index];
-                                let distance = automaton.dfa.distance(iv.state).to_u8();
+                                let distance = automaton.eval(string).to_u8();
+                                let same_length = string.len() == automaton.query_len();

                                for di in iv.values {
                                    let match_ = Match {
@@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
                                        distance: distance,
                                        attribute: di.attribute,
                                        attribute_index: di.attribute_index,
-                                        is_exact: distance == 0 && string.len() == automaton.query_len,
+                                        is_exact: distance == 0 && same_length,
                                    };
                                    matches.entry(di.document)
-                                            .or_insert_with(Vec::new)
-                                            .push(match_);
+                                           .or_insert_with(Vec::new)
+                                           .push(match_);
                                }
                            }
                        },
--- a/raptor/src/rank/number_of_words.rs
+++ b/raptor/src/rank/number_of_words.rs
@@ -1,7 +1,7 @@
 use std::cmp::Ordering;
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 #[inline]
 fn number_of_query_words(matches: &[Match]) -> usize {
--- a/raptor/src/rank/sum_of_typos.rs
+++ b/raptor/src/rank/sum_of_typos.rs
@@ -1,7 +1,7 @@
 use std::cmp::Ordering;
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 #[inline]
 fn sum_matches_typos(matches: &[Match]) -> u8 {
--- a/raptor/src/rank/sum_of_words_attribute.rs
+++ b/raptor/src/rank/sum_of_words_attribute.rs
@@ -1,7 +1,7 @@
 use std::cmp::Ordering;
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 #[inline]
 fn sum_matches_attributes(matches: &[Match]) -> u8 {
--- a/raptor/src/rank/sum_of_words_position.rs
+++ b/raptor/src/rank/sum_of_words_position.rs
@@ -1,7 +1,7 @@
 use std::cmp::Ordering;
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 #[inline]
 fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
--- a/raptor/src/rank/words_proximity.rs
+++ b/raptor/src/rank/words_proximity.rs
@@ -1,7 +1,7 @@
 use std::cmp::{self, Ordering};
-use Match;
-use rank::{match_query_index, Document};
 use group_by::GroupBy;
+use crate::Match;
+use crate::rank::{match_query_index, Document};

 const MAX_DISTANCE: u32 = 8;