squash-me: Improve the highlighted area

This commit is contained in:
Clément Renault
2019-12-05 14:35:38 +01:00
parent 8a17a8d949
commit e7654ffa1e
2 changed files with 61 additions and 29 deletions

View File

@@ -17,6 +17,7 @@ use meilisearch_types::{DocIndex, Highlight};
use sdset::Set;
use slice_group_by::{GroupBy, GroupByMut};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::automaton::{build_dfa, build_prefix_dfa};
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
@@ -35,7 +36,7 @@ pub fn bucket_sort<'c>(
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, automatons, &mut arena, main_store, postings_lists_store)?;
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
@@ -45,9 +46,6 @@ pub fn bucket_sort<'c>(
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
dbg!(mem::size_of::<BareMatch>());
dbg!(mem::size_of::<SimpleMatch>());
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
@@ -58,6 +56,9 @@ pub fn bucket_sort<'c>(
before_raw_documents_building.elapsed(),
);
dbg!(mem::size_of::<BareMatch>());
dbg!(mem::size_of::<SimpleMatch>());
let mut groups = vec![raw_documents.as_mut_slice()];
let criteria = [
@@ -67,6 +68,7 @@ pub fn bucket_sort<'c>(
Box::new(Attribute),
Box::new(WordsPosition),
Box::new(Exact),
Box::new(StableDocId),
];
'criteria: for criterion in &criteria {
@@ -74,16 +76,6 @@ pub fn bucket_sort<'c>(
let mut documents_seen = 0;
for mut group in tmp_groups {
// if criterion.name() == "attribute" {
// for document in group.iter() {
// println!("--- {} - {}",
// document.raw_matches.len(),
// document.raw_matches.iter().map(|x| arena[x.postings_list].len()).sum::<usize>(),
// );
// }
// }
let before_criterion_preparation = Instant::now();
criterion.prepare(&mut group, &mut arena);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
@@ -111,13 +103,18 @@ pub fn bucket_sort<'c>(
let iter = iter.map(|d| {
let highlights = d.raw_matches.iter().flat_map(|sm| {
let postings_list = &arena[sm.postings_list];
postings_list.iter().filter(|m| m.document_id == d.raw_matches[0].document_id).map(|m| {
Highlight { attribute: m.attribute, char_index: m.char_index, char_length: m.char_length }
let input = postings_list.input();
let query = &automatons[sm.query_index as usize].query;
postings_list.iter().map(move |m| {
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
})
}).collect();
// let highlights = Default::default();
Document {
id: d.raw_matches[0].document_id,
highlights,
@@ -153,25 +150,31 @@ pub struct SimpleMatch {
#[derive(Clone)]
pub struct PostingsListView<'txn> {
data: Rc<Cow<'txn, Set<DocIndex>>>,
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
}
impl<'txn> PostingsListView<'txn> {
pub fn new(data: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = data.len();
PostingsListView { data, offset: 0, len }
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView { input, postings_list, offset: 0, len }
}
pub fn len(&self) -> usize {
self.len
}
pub fn input(&self) -> &[u8] {
&self.input
}
pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
assert!(offset + len <= self.len);
PostingsListView {
data: self.data.clone(),
input: self.input.clone(),
postings_list: self.postings_list.clone(),
offset: self.offset + offset,
len: len,
}
@@ -180,7 +183,7 @@ impl<'txn> PostingsListView<'txn> {
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.data[self.offset..self.offset + self.len])
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
@@ -188,13 +191,13 @@ impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.data[self.offset..self.offset + self.len])
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
fn fetch_matches<'txn, 'tag>(
reader: &'txn heed::RoTxn<MainT>,
automatons: Vec<QueryWordAutomaton>,
automatons: &[QueryWordAutomaton],
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
@@ -213,7 +216,7 @@ fn fetch_matches<'txn, 'tag>(
let mut stream_next_time = Duration::default();
let mut postings_lists_fetching_time = Duration::default();
for (query_index, automaton) in automatons.into_iter().enumerate() {
for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now();
let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, is_prefix } = automaton;
@@ -237,12 +240,14 @@ fn fetch_matches<'txn, 'tag>(
number_of_words += 1;
let distance = dfa.eval(input).to_u8();
let is_exact = is_exact && distance == 0 && input.len() == query.len();
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
let postings_list_view = PostingsListView::new(Rc::new(postings_list));
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::new(input, postings_list);
let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {

View File

@@ -331,3 +331,30 @@ impl Criterion for Exact {
lhs.cmp(&rhs).reverse()
}
}
/// Criterion that compares two documents by the document id carried by
/// their first raw match.
pub struct StableDocId;

impl Criterion for StableDocId {
    /// Name under which this criterion is reported.
    fn name(&self) -> &str {
        "stable document id"
    }

    /// Does nothing: `evaluate` only reads the document id of the first raw
    /// match, so there is no per-group state to build here.
    fn prepare(
        &self,
        documents: &mut [RawDocument],
        postings_lists: &mut PostingsListsArena,
    ) {
        // Intentionally left empty.
    }

    /// Orders `lhs` against `rhs` using the natural ordering of the document
    /// id found on each side's first raw match.
    ///
    /// NOTE(review): indexes `raw_matches[0]` on both sides, so this presumably
    /// panics when a document has no raw match; confirm callers never build one.
    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &PostingsListsArena,
    ) -> Ordering {
        lhs.raw_matches[0]
            .document_id
            .cmp(&rhs.raw_matches[0].document_id)
    }
}