feat: Simplify the levenshtein construction

This commit is contained in:
Clément Renault
2018-09-09 11:13:58 +02:00
parent f0f5fc9891
commit 31e04f0120
17 changed files with 156 additions and 141 deletions

View File

@ -1,4 +1,7 @@
cargo-features = ["edition"]
[package]
edition = "2018"
name = "raptor"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
@ -6,14 +9,15 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
byteorder = "1.2"
fnv = "1.0"
lazy_static = "1.1"
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "always-match-clone"
branch = "automaton-for-deref"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "custom-fst"
branch = "new-custom-fst"
features = ["fst_automaton"]
[dependencies.rocksdb]

50
raptor/src/automaton.rs Normal file
View File

@ -0,0 +1,50 @@
use std::ops::Deref;
use fst::Automaton;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA, Distance,
};
// Shared Levenshtein automaton builders, one per tolerated distance (0, 1 and 2).
// `lazy_static!` constructs each of them on first access, so `build` can reuse
// them for every query instead of creating a new builder each time.
// NOTE(review): in the upstream levenshtein_automata API the arguments of `new`
// are `(max_distance, transposition_cost_one)` — confirm this still holds for
// the forked branch this crate depends on.
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
/// A Levenshtein `DFA` bundled with the length of the query it was built from.
///
/// Values are created by `build`; the wrapped automaton is reachable through
/// the `Deref` implementation.
pub struct DfaExt {
/// Length of the query in bytes (the value of `str::len` at build time).
query_len: usize,
/// The automaton produced by the Levenshtein builder for that query.
automaton: DFA,
}
impl Deref for DfaExt {
type Target = DFA;
fn deref(&self) -> &Self::Target {
&self.automaton
}
}
pub fn build(query: &str) -> DfaExt {
let dfa = match query.len() {
0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
_ => LEVDIST2.build_prefix_dfa(query),
};
DfaExt { query_len: query.len(), automaton: dfa }
}
/// Extension of `fst::Automaton` that adds whole-input evaluation and access
/// to the length of the query the automaton was built from.
pub trait AutomatonExt: Automaton {
/// Evaluates the byte string `s` against the automaton and returns the
/// resulting `Distance`.
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
/// Returns the length of the query behind this automaton (the `DfaExt`
/// implementation reports the byte length recorded at build time).
fn query_len(&self) -> usize;
}
impl AutomatonExt for DfaExt {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
self.automaton.eval(s)
}
fn query_len(&self) -> usize {
self.query_len
}
}

View File

@ -1,37 +0,0 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
/// Holds three Levenshtein automaton builders; `LevBuilder::new` fills slot
/// `i` with a builder created for distance `i` (0, 1 and 2).
pub struct LevBuilder {
/// Builders indexed by the distance they were created with.
automatons: [LevenshteinAutomatonBuilder; 3],
}
impl LevBuilder {
    /// Creates the three underlying builders, for distances 0, 1 and 2.
    pub fn new() -> Self {
        let automatons = [
            LevenshteinAutomatonBuilder::new(0, false),
            LevenshteinAutomatonBuilder::new(1, false),
            LevenshteinAutomatonBuilder::new(2, false),
        ];
        Self { automatons }
    }

    /// Builds the prefix automaton for `query`.
    ///
    /// The builder is selected from the query length in bytes: up to 4 bytes
    /// uses slot 0, up to 8 bytes slot 1, anything longer slot 2.
    ///
    /// # Panics
    ///
    /// Panics if `query` is empty.
    pub fn get_automaton(&self, query: &str) -> Levenshtein {
        assert!(!query.is_empty());

        let query_len = query.len();
        let slot = match query_len {
            0..=4 => 0,
            5..=8 => 1,
            _ => 2,
        };

        let dfa = self.automatons[slot].build_prefix_dfa(query);
        Levenshtein { dfa, query_len }
    }
}
/// A built Levenshtein `DFA` together with the length of its source query,
/// as returned by `LevBuilder::get_automaton`.
#[derive(Clone)]
pub struct Levenshtein {
/// The automaton built for the query.
pub dfa: DFA,
/// Length of the query in bytes (`str::len`).
pub query_len: usize,
}

View File

@ -1,24 +1,16 @@
#![feature(nll)]
extern crate fst;
extern crate fnv;
extern crate group_by;
extern crate levenshtein_automata;
extern crate byteorder;
extern crate rocksdb;
#[macro_use] extern crate lazy_static;
pub mod rank;
pub mod metadata;
pub mod levenshtein;
pub mod automaton;
pub use self::metadata::{
Metadata, MetadataBuilder,
StreamWithState, StreamWithStateBuilder,
UnionWithState, OpWithStateBuilder,
IndexedValuesWithState,
Stream, StreamBuilder,
Union, OpBuilder,
IndexedValues,
};
pub use self::rank::{RankedStream};
pub use self::levenshtein::LevBuilder;
pub use self::rank::RankedStream;
pub type DocumentId = u64;

View File

@ -9,7 +9,7 @@ use std::mem;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{self, Map, MapBuilder, Automaton};
use fst::raw::MmapReadOnly;
use DocIndex;
use crate::DocIndex;
#[repr(C)]
struct Range {
@ -256,23 +256,23 @@ unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
from_raw_parts(ptr, len)
}
pub struct OpWithStateBuilder<'m, 'v, U> {
inner: fst::map::OpWithStateBuilder<'m, U>,
pub struct OpBuilder<'m, 'v> {
inner: fst::map::OpBuilder<'m>,
indexes: &'v DocIndexes,
}
impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
impl<'m, 'v> OpBuilder<'m, 'v> {
pub fn new(indexes: &'v DocIndexes) -> Self {
Self {
inner: fst::map::OpWithStateBuilder::new(),
inner: fst::map::OpBuilder::new(),
indexes: indexes,
}
}
pub fn add<I, S>(mut self, streamable: I) -> Self
where
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
{
self.push(streamable);
self
@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
pub fn push<I, S>(&mut self, streamable: I)
where
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
{
self.inner.push(streamable);
}
pub fn union(self) -> UnionWithState<'m, 'v, U> {
UnionWithState {
pub fn union(self) -> Union<'m, 'v> {
Union {
inner: self.inner.union(),
outs: Vec::new(),
indexes: self.indexes,
@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
}
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexedValuesWithState<'a, U> {
pub struct IndexedValues<'a> {
pub index: usize,
pub values: &'a [DocIndex],
pub state: U,
}
pub struct UnionWithState<'m, 'v, U> {
inner: fst::map::UnionWithState<'m, U>,
outs: Vec<IndexedValuesWithState<'v, U>>,
pub struct Union<'m, 'v> {
inner: fst::map::Union<'m>,
outs: Vec<IndexedValues<'v>>,
indexes: &'v DocIndexes,
}
impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U>
where
U: Clone,
{
type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]);
impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> {
type Item = (&'a [u8], &'a [IndexedValues<'a>]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.inner.next() {
@ -322,8 +318,7 @@ where
for ivalue in ivalues {
if let Some(values) = self.indexes.get(ivalue.value) {
let index = ivalue.index;
let state = ivalue.state.clone();
self.outs.push(IndexedValuesWithState { index, values, state })
self.outs.push(IndexedValues { index, values })
}
}
Some((s, &self.outs))
@ -333,44 +328,43 @@ where
}
}
pub struct StreamWithStateBuilder<'m, 'v, A> {
inner: fst::map::StreamWithStateBuilder<'m, A>,
pub struct StreamBuilder<'m, 'v, A> {
inner: fst::map::StreamBuilder<'m, A>,
indexes: &'v DocIndexes,
}
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A>
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A>
where
A: Automaton,
A::State: Clone,
{
type Item = <Self::Into as fst::Streamer<'a>>::Item;
type Into = StreamWithState<'m, 'v, A>;
type Into = Stream<'m, 'v, A>;
fn into_stream(self) -> Self::Into {
StreamWithState {
Stream {
inner: self.inner.into_stream(),
indexes: self.indexes,
}
}
}
pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
inner: fst::map::StreamWithState<'m, A>,
pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
inner: fst::map::Stream<'m, A>,
indexes: &'v DocIndexes,
}
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A>
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A>
where
A: Automaton,
A::State: Clone,
{
type Item = (&'a [u8], &'a [DocIndex], A::State);
type Item = (&'a [u8], &'a [DocIndex]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.inner.next() {
Some((key, i, state)) => {
Some((key, i)) => {
match self.indexes.get(i) {
Some(values) => Some((key, values, state)),
Some(values) => Some((key, values)),
None => None,
}
},

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
#[inline]
fn contains_exact(matches: &[Match]) -> bool {

View File

@ -6,13 +6,14 @@ mod sum_of_words_position;
mod exact;
use std::cmp::Ordering;
use std::rc::Rc;
use std::{mem, vec};
use fst;
use fnv::FnvHashMap;
use levenshtein::Levenshtein;
use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState};
use {Match, DocumentId};
use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::{DocIndexes, OpBuilder, Union};
use crate::{Match, DocumentId};
use self::{
sum_of_typos::sum_of_typos,
@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
impl<'m, 'v> RankedStream<'m, 'v> {
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
let mut op = OpWithStateBuilder::new(indexes);
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<DfaExt>, limit: usize) -> Self {
let mut op = OpBuilder::new(indexes);
for automaton in automatons.iter().map(|l| l.dfa.clone()) {
let stream = map.search(automaton).with_state();
let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
for automaton in automatons.iter().cloned() {
let stream = map.search(automaton);
op.push(stream);
}
@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
enum RankedStreamInner<'m, 'v> {
Fed {
inner: UnionWithState<'m, 'v, u32>,
automatons: Vec<Levenshtein>,
inner: Union<'m, 'v>,
automatons: Vec<Rc<DfaExt>>,
limit: usize,
matches: FnvHashMap<DocumentId, Vec<Match>>,
},
@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
for iv in indexed_values {
let automaton = &automatons[iv.index];
let distance = automaton.dfa.distance(iv.state).to_u8();
let distance = automaton.eval(string).to_u8();
let same_length = string.len() == automaton.query_len();
for di in iv.values {
let match_ = Match {
@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
distance: distance,
attribute: di.attribute,
attribute_index: di.attribute_index,
is_exact: distance == 0 && string.len() == automaton.query_len,
is_exact: distance == 0 && same_length,
};
matches.entry(di.document)
.or_insert_with(Vec::new)
.push(match_);
.or_insert_with(Vec::new)
.push(match_);
}
}
},

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
#[inline]
fn number_of_query_words(matches: &[Match]) -> usize {

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
#[inline]
fn sum_matches_typos(matches: &[Match]) -> u8 {

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
#[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 {

View File

@ -1,7 +1,7 @@
use std::cmp::Ordering;
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {

View File

@ -1,7 +1,7 @@
use std::cmp::{self, Ordering};
use Match;
use rank::{match_query_index, Document};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};
const MAX_DISTANCE: u32 = 8;