mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 03:40:00 +00:00
chore: Make the repo use examples and keep the library
This commit is contained in:
22
src/rank/exact.rs
Normal file
22
src/rank/exact.rs
Normal file
@ -0,0 +1,22 @@
|
||||
use std::cmp::Ordering;
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn contains_exact(matches: &[Match]) -> bool {
|
||||
matches.iter().any(|m| m.is_exact)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn number_exact_matches(matches: &[Match]) -> usize {
|
||||
GroupBy::new(matches, match_query_index).map(contains_exact).count()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn exact(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
let lhs = number_exact_matches(&lhs.matches);
|
||||
let rhs = number_exact_matches(&rhs.matches);
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
169
src/rank/mod.rs
Normal file
169
src/rank/mod.rs
Normal file
@ -0,0 +1,169 @@
|
||||
mod sum_of_typos;
|
||||
mod number_of_words;
|
||||
mod words_proximity;
|
||||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod exact;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::rc::Rc;
|
||||
use std::{mem, vec};
|
||||
use fst::Streamer;
|
||||
use fnv::FnvHashMap;
|
||||
use group_by::GroupByMut;
|
||||
use crate::automaton::{DfaExt, AutomatonExt};
|
||||
use crate::metadata::Metadata;
|
||||
use crate::metadata::ops::{OpBuilder, Union};
|
||||
use crate::{Match, DocumentId};
|
||||
|
||||
use self::{
|
||||
sum_of_typos::sum_of_typos,
|
||||
number_of_words::number_of_words,
|
||||
words_proximity::words_proximity,
|
||||
sum_of_words_attribute::sum_of_words_attribute,
|
||||
sum_of_words_position::sum_of_words_position,
|
||||
exact::exact,
|
||||
};
|
||||
|
||||
#[inline]
|
||||
fn match_query_index(a: &Match, b: &Match) -> bool {
|
||||
a.query_index == b.query_index
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Document {
|
||||
pub document_id: DocumentId,
|
||||
pub matches: Vec<Match>,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
pub fn new(doc: DocumentId, match_: Match) -> Self {
|
||||
Self::from_sorted_matches(doc, vec![match_])
|
||||
}
|
||||
|
||||
pub fn from_sorted_matches(doc: DocumentId, matches: Vec<Match>) -> Self {
|
||||
Self {
|
||||
document_id: doc,
|
||||
matches: matches,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize) -> vec::IntoIter<Document> {
|
||||
let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| {
|
||||
matches.sort_unstable();
|
||||
Document::from_sorted_matches(id, matches)
|
||||
}).collect();
|
||||
|
||||
let sorts = &[
|
||||
sum_of_typos,
|
||||
number_of_words,
|
||||
words_proximity,
|
||||
sum_of_words_attribute,
|
||||
sum_of_words_position,
|
||||
exact,
|
||||
];
|
||||
|
||||
let mut groups = vec![documents.as_mut_slice()];
|
||||
|
||||
for sort in sorts {
|
||||
let temp = mem::replace(&mut groups, Vec::new());
|
||||
let mut computed = 0;
|
||||
|
||||
'grp: for group in temp {
|
||||
group.sort_unstable_by(sort);
|
||||
for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) {
|
||||
computed += group.len();
|
||||
groups.push(group);
|
||||
if computed >= limit { break 'grp }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
documents.truncate(limit);
|
||||
documents.into_iter()
|
||||
}
|
||||
|
||||
/// Stream of ranked `Document`s; a thin public wrapper around
/// `RankedStreamInner`.
pub struct RankedStream<'m>(RankedStreamInner<'m>);
|
||||
|
||||
impl<'m> RankedStream<'m> {
|
||||
pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>, limit: usize) -> Self {
|
||||
let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
|
||||
let mut builder = OpBuilder::with_automatons(automatons.clone());
|
||||
builder.push(metadata);
|
||||
|
||||
let inner = RankedStreamInner::Fed {
|
||||
inner: builder.union(),
|
||||
automatons: automatons,
|
||||
limit: limit,
|
||||
matches: FnvHashMap::default(),
|
||||
};
|
||||
|
||||
RankedStream(inner)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> {
|
||||
type Item = Document;
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
self.0.next()
|
||||
}
|
||||
}
|
||||
|
||||
enum RankedStreamInner<'m> {
|
||||
Fed {
|
||||
inner: Union<'m>,
|
||||
automatons: Vec<Rc<DfaExt>>,
|
||||
limit: usize,
|
||||
matches: FnvHashMap<DocumentId, Vec<Match>>,
|
||||
},
|
||||
Pours {
|
||||
inner: vec::IntoIter<Document>,
|
||||
},
|
||||
}
|
||||
|
||||
impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> {
|
||||
type Item = Document;
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
match self {
|
||||
RankedStreamInner::Fed { inner, automatons, limit, matches } => {
|
||||
match inner.next() {
|
||||
Some((string, indexed_values)) => {
|
||||
for iv in indexed_values {
|
||||
|
||||
let automaton = &automatons[iv.index];
|
||||
let distance = automaton.eval(string).to_u8();
|
||||
let same_length = string.len() == automaton.query_len();
|
||||
|
||||
for di in iv.doc_indexes.as_slice() {
|
||||
let match_ = Match {
|
||||
query_index: iv.index as u32,
|
||||
distance: distance,
|
||||
attribute: di.attribute,
|
||||
attribute_index: di.attribute_index,
|
||||
is_exact: distance == 0 && same_length,
|
||||
};
|
||||
matches.entry(di.document)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(match_);
|
||||
}
|
||||
}
|
||||
},
|
||||
None => {
|
||||
let matches = mem::replace(matches, FnvHashMap::default());
|
||||
*self = RankedStreamInner::Pours {
|
||||
inner: matches_into_iter(matches, *limit).into_iter()
|
||||
};
|
||||
},
|
||||
}
|
||||
},
|
||||
RankedStreamInner::Pours { inner } => {
|
||||
return inner.next()
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
17
src/rank/number_of_words.rs
Normal file
17
src/rank/number_of_words.rs
Normal file
@ -0,0 +1,17 @@
|
||||
use std::cmp::Ordering;
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(matches: &[Match]) -> usize {
|
||||
GroupBy::new(matches, match_query_index).count()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn number_of_words(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
let lhs = number_of_query_words(&lhs.matches);
|
||||
let rhs = number_of_query_words(&rhs.matches);
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
123
src/rank/sum_of_typos.rs
Normal file
123
src/rank/sum_of_typos.rs
Normal file
@ -0,0 +1,123 @@
|
||||
use std::cmp::Ordering;
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(matches: &[Match]) -> i8 {
|
||||
let mut sum_typos = 0;
|
||||
let mut number_words = 0;
|
||||
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can do this assumption safely
|
||||
for group in GroupBy::new(matches, match_query_index) {
|
||||
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
|
||||
number_words += 1;
|
||||
}
|
||||
|
||||
sum_typos - number_words
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sum_of_typos(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
let lhs = sum_matches_typos(&lhs.matches);
|
||||
let rhs = sum_matches_typos(&rhs.matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        // Non-exact match on attribute 0.
        let hit = |query_index, distance, attribute_index| Match {
            query_index,
            distance,
            attribute: 0,
            attribute_index,
            is_exact: false,
        };

        let doc0 = Document { document_id: 0, matches: vec![hit(0, 0, 0), hit(1, 0, 2)] };
        let doc1 = Document { document_id: 1, matches: vec![hit(0, 1, 0), hit(1, 0, 2)] };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        // Non-exact match on attribute 0.
        let hit = |query_index, distance, attribute_index| Match {
            query_index,
            distance,
            attribute: 0,
            attribute_index,
            is_exact: false,
        };

        let doc0 = Document { document_id: 0, matches: vec![hit(0, 0, 0), hit(1, 0, 1)] };
        let doc1 = Document { document_id: 1, matches: vec![hit(0, 0, 0)] };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        // Non-exact match on attribute 0.
        let hit = |query_index, distance, attribute_index| Match {
            query_index,
            distance,
            attribute: 0,
            attribute_index,
            is_exact: false,
        };

        let doc0 = Document { document_id: 0, matches: vec![hit(0, 0, 0), hit(1, 1, 1)] };
        let doc1 = Document { document_id: 1, matches: vec![hit(0, 0, 0)] };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Equal);
    }
}
|
21
src/rank/sum_of_words_attribute.rs
Normal file
21
src/rank/sum_of_words_attribute.rs
Normal file
@ -0,0 +1,21 @@
|
||||
use std::cmp::Ordering;
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can do this assumption safely
|
||||
GroupBy::new(matches, match_query_index).map(|group| unsafe {
|
||||
group.get_unchecked(0).attribute
|
||||
}).sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sum_of_words_attribute(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
let lhs = sum_matches_attributes(&lhs.matches);
|
||||
let rhs = sum_matches_attributes(&rhs.matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
21
src/rank/sum_of_words_position.rs
Normal file
21
src/rank/sum_of_words_position.rs
Normal file
@ -0,0 +1,21 @@
|
||||
use std::cmp::Ordering;
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
||||
// note that GroupBy will never return an empty group
|
||||
// so we can do this assumption safely
|
||||
GroupBy::new(matches, match_query_index).map(|group| unsafe {
|
||||
group.get_unchecked(0).attribute_index
|
||||
}).sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sum_of_words_position(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
let lhs = sum_matches_attribute_index(&lhs.matches);
|
||||
let rhs = sum_matches_attribute_index(&rhs.matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
104
src/rank/words_proximity.rs
Normal file
104
src/rank/words_proximity.rs
Normal file
@ -0,0 +1,104 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
/// Upper bound applied to the gap between two attribute indexes.
const MAX_DISTANCE: u32 = 8;

/// Proximity between two attribute indexes.
///
/// The gap between the indexes is capped at `MAX_DISTANCE`. When `lhs` is
/// not strictly before `rhs` (equal indexes included), one is added to the
/// capped gap.
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    let in_order = lhs < rhs;
    let (low, high) = if in_order { (lhs, rhs) } else { (rhs, lhs) };
    let gap = cmp::min(high - low, MAX_DISTANCE);

    if in_order { gap } else { gap + 1 }
}
|
||||
|
||||
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
|
||||
if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
|
||||
index_proximity(lhs.attribute_index, rhs.attribute_index)
|
||||
}
|
||||
|
||||
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
|
||||
let mut min_prox = u32::max_value();
|
||||
for a in lhs {
|
||||
for b in rhs {
|
||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||
}
|
||||
}
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(matches: &[Match]) -> u32 {
|
||||
let mut proximity = 0;
|
||||
let mut iter = GroupBy::new(matches, match_query_index);
|
||||
|
||||
// iterate over groups by windows of size 2
|
||||
let mut last = iter.next();
|
||||
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
||||
proximity += min_proximity(lhs, rhs);
|
||||
last = Some(rhs);
|
||||
}
|
||||
|
||||
proximity
|
||||
}
|
||||
|
||||
pub fn words_proximity(lhs: &Document, rhs: &Document) -> Ordering {
|
||||
matches_proximity(&lhs.matches).cmp(&matches_proximity(&rhs.matches))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        // Match at the given position, every other field taken from `Match::zero()`.
        let at = |query_index, attribute, attribute_index| Match {
            query_index,
            attribute,
            attribute_index,
            ..Match::zero()
        };

        let matches = [
            at(0, 0, 0),
            at(1, 1, 0),
            at(2, 1, 1),
            at(2, 2, 0),
            at(3, 3, 1),
        ];

        //   soup -> of = 8
        // + of -> the  = 1
        // + the -> day = 8 (not 1)
        assert_eq!(matches_proximity(&matches), 17);
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        // Match at the given position, every other field taken from `Match::zero()`.
        let at = |query_index, attribute, attribute_index| Match {
            query_index,
            attribute,
            attribute_index,
            ..Match::zero()
        };

        let matches = [
            at(0, 0, 0),
            at(0, 1, 0),
            at(1, 1, 1),
            at(2, 1, 2),
            at(3, 0, 1),
            at(3, 1, 3),
        ];

        //   soup -> of = 1
        // + of -> the  = 1
        // + the -> day = 1
        assert_eq!(matches_proximity(&matches), 3);
    }
}
|
Reference in New Issue
Block a user