mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	wip: Make the new query tree work with the criteria
This commit is contained in:
		| @@ -1,5 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| use std::convert::TryFrom; | ||||
| use std::mem; | ||||
| use std::ops::Deref; | ||||
| use std::ops::Range; | ||||
| @@ -10,7 +11,6 @@ use std::{cmp, fmt}; | ||||
|  | ||||
| use compact_arena::{SmallArena, Idx32, mk_arena}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use hashbrown::HashMap; | ||||
| use levenshtein_automata::DFA; | ||||
| use log::debug; | ||||
| use meilisearch_tokenizer::{is_cjk, split_query_string}; | ||||
| @@ -49,36 +49,6 @@ pub fn bucket_sort<'c, FI>( | ||||
| where | ||||
|     FI: Fn(DocumentId) -> bool, | ||||
| { | ||||
|     let words_set = match unsafe { main_store.static_words_fst(reader)? } { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(Vec::new()), | ||||
|     }; | ||||
|  | ||||
|     let context = QTContext { | ||||
|         words_set, | ||||
|         synonyms: synonyms_store, | ||||
|         postings_lists: postings_lists_store, | ||||
|         prefix_postings_lists: prefix_postings_lists_cache_store, | ||||
|     }; | ||||
|  | ||||
|     let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); | ||||
|     println!("{:?}", operation); | ||||
|     println!("{:?}", mapping); | ||||
|  | ||||
|     let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); | ||||
|     println!("found {} documents", docids.len()); | ||||
|     println!("number of postings {:?}", queries.len()); | ||||
|  | ||||
|     let before = Instant::now(); | ||||
|     for ((query, input), matches) in queries { | ||||
|         // TODO optimize the filter by skipping docids that have already been seen | ||||
|         for matches in matches.linear_group_by_key(|m| m.document_id).filter(|ms| docids.contains(&ms[0].document_id)) { | ||||
|             // ... | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     println!("matches cleaned in {:.02?}", before.elapsed()); | ||||
|  | ||||
|     // We delegate the filter work to the distinct query builder, | ||||
|     // specifying a distinct rule that has no effect. | ||||
|     if filter.is_some() { | ||||
| @@ -102,47 +72,58 @@ where | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     let before_bucket_sort = Instant::now(); | ||||
|     let words_set = match unsafe { main_store.static_words_fst(reader)? } { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(Vec::new()), | ||||
|     }; | ||||
|  | ||||
|     let (mut automatons, mut query_enhancer) = | ||||
|         construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; | ||||
|     let context = QTContext { | ||||
|         words_set, | ||||
|         synonyms: synonyms_store, | ||||
|         postings_lists: postings_lists_store, | ||||
|         prefix_postings_lists: prefix_postings_lists_cache_store, | ||||
|     }; | ||||
|  | ||||
|     if let [automaton] = &automatons[..] { | ||||
|         if automaton.is_prefix && automaton.query.len() <= 4 { | ||||
|             let mut prefix = [0; 4]; | ||||
|             let len = cmp::min(4, automaton.query.len()); | ||||
|             prefix[..len].copy_from_slice(&automaton.query.as_bytes()[..len]); | ||||
|     let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); | ||||
|     println!("{:?}", operation); | ||||
|     println!("{:?}", mapping); | ||||
|  | ||||
|             let mut documents = Vec::new(); | ||||
|             let iter = prefix_documents_cache_store.prefix_documents(reader, prefix)?; | ||||
|             for result in iter.skip(range.start).take(range.len()) { | ||||
|                 let (docid, highlights) = result?; | ||||
|                 documents.push(Document::from_highlights(docid, &highlights)); | ||||
|     let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); | ||||
|     println!("found {} documents", docids.len()); | ||||
|     println!("number of postings {:?}", queries.len()); | ||||
|  | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     let mut bare_matches = Vec::new(); | ||||
|     mk_arena!(arena); | ||||
|     for ((query, input), matches) in queries { | ||||
|  | ||||
|         let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches)); | ||||
|         // TODO optimize the filter by skipping docids that have already been seen | ||||
|         let mut offset = 0; | ||||
|         for matches in postings_list_view.linear_group_by_key(|m| m.document_id) { | ||||
|             let document_id = matches[0].document_id; | ||||
|             if docids.contains(&document_id) { | ||||
|                 let range = postings_list_view.range(offset, matches.len()); | ||||
|                 let posting_list_index = arena.add(range); | ||||
|                 let bare_match = BareMatch { | ||||
|                     document_id, | ||||
|                     query_index: u16::try_from(query.id).unwrap(), | ||||
|                     distance: 0, | ||||
|                     is_exact: true, // TODO where can I find this info? | ||||
|                     postings_list: posting_list_index, | ||||
|                 }; | ||||
|  | ||||
|                 bare_matches.push(bare_match); | ||||
|             } | ||||
|  | ||||
|             if !documents.is_empty() { | ||||
|                 return Ok(documents); | ||||
|             } | ||||
|             offset += matches.len(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     debug!("{:?}", query_enhancer); | ||||
|     println!("matches cleaned in {:.02?}", before.elapsed()); | ||||
|  | ||||
|     let before_postings_lists_fetching = Instant::now(); | ||||
|     mk_arena!(arena); | ||||
|     let mut bare_matches = | ||||
|         fetch_matches( | ||||
|             reader, | ||||
|             &automatons, | ||||
|             &mut arena, | ||||
|             main_store, | ||||
|             postings_lists_store, | ||||
|             prefix_postings_lists_cache_store, | ||||
|         )?; | ||||
|     debug!("bare matches ({}) retrieved in {:.02?}", | ||||
|         bare_matches.len(), | ||||
|         before_postings_lists_fetching.elapsed(), | ||||
|     ); | ||||
|     let before_bucket_sort = Instant::now(); | ||||
|  | ||||
|     let before_raw_documents_presort = Instant::now(); | ||||
|     bare_matches.sort_unstable_by_key(|sm| sm.document_id); | ||||
| @@ -152,14 +133,11 @@ where | ||||
|     let mut prefiltered_documents = 0; | ||||
|     let mut raw_documents = Vec::new(); | ||||
|     for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { | ||||
|         prefiltered_documents += 1; | ||||
|         if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { | ||||
|             raw_documents.push(raw_document); | ||||
|         } | ||||
|         let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref()); | ||||
|         raw_documents.push(raw_document); | ||||
|     } | ||||
|     debug!("creating {} (original {}) candidates documents took {:.02?}", | ||||
|     debug!("creating {} candidates documents took {:.02?}", | ||||
|         raw_documents.len(), | ||||
|         prefiltered_documents, | ||||
|         before_raw_documents_building.elapsed(), | ||||
|     ); | ||||
|  | ||||
| @@ -178,8 +156,7 @@ where | ||||
|             let ctx = ContextMut { | ||||
|                 reader, | ||||
|                 postings_lists: &mut arena, | ||||
|                 query_enhancer: &mut query_enhancer, | ||||
|                 automatons: &mut automatons, | ||||
|                 query_mapping: &mapping, | ||||
|                 documents_fields_counts_store, | ||||
|             }; | ||||
|  | ||||
| @@ -188,8 +165,7 @@ where | ||||
|  | ||||
|             let ctx = Context { | ||||
|                 postings_lists: &arena, | ||||
|                 query_enhancer: &query_enhancer, | ||||
|                 automatons: &automatons, | ||||
|                 query_mapping: &mapping, | ||||
|             }; | ||||
|  | ||||
|             let must_count = criterion.name() == "proximity"; | ||||
| @@ -223,7 +199,7 @@ where | ||||
|     debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed)); | ||||
|  | ||||
|     let iter = raw_documents.into_iter().skip(range.start).take(range.len()); | ||||
|     let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref())); | ||||
|     let iter = iter.map(|rd| Document::from_raw(rd, &arena, searchable_attrs.as_ref())); | ||||
|     let documents = iter.collect(); | ||||
|  | ||||
|     debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed()); | ||||
| @@ -251,163 +227,7 @@ where | ||||
|     FI: Fn(DocumentId) -> bool, | ||||
|     FD: Fn(DocumentId) -> Option<u64>, | ||||
| { | ||||
|     let (mut automatons, mut query_enhancer) = | ||||
|         construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; | ||||
|  | ||||
|     let before_postings_lists_fetching = Instant::now(); | ||||
|     mk_arena!(arena); | ||||
|     let mut bare_matches = fetch_matches( | ||||
|         reader, | ||||
|         &automatons, | ||||
|         &mut arena, | ||||
|         main_store, | ||||
|         postings_lists_store, | ||||
|         prefix_postings_lists_cache_store, | ||||
|     )?; | ||||
|     debug!("bare matches ({}) retrieved in {:.02?}", | ||||
|         bare_matches.len(), | ||||
|         before_postings_lists_fetching.elapsed(), | ||||
|     ); | ||||
|  | ||||
|     let before_raw_documents_presort = Instant::now(); | ||||
|     bare_matches.sort_unstable_by_key(|sm| sm.document_id); | ||||
|     debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); | ||||
|  | ||||
|     let before_raw_documents_building = Instant::now(); | ||||
|     let mut prefiltered_documents = 0; | ||||
|     let mut raw_documents = Vec::new(); | ||||
|     for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { | ||||
|         prefiltered_documents += 1; | ||||
|         if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) { | ||||
|             raw_documents.push(raw_document); | ||||
|         } | ||||
|     } | ||||
|     debug!("creating {} (original {}) candidates documents took {:.02?}", | ||||
|         raw_documents.len(), | ||||
|         prefiltered_documents, | ||||
|         before_raw_documents_building.elapsed(), | ||||
|     ); | ||||
|  | ||||
|     let mut groups = vec![raw_documents.as_mut_slice()]; | ||||
|     let mut key_cache = HashMap::new(); | ||||
|  | ||||
|     let mut filter_map = HashMap::new(); | ||||
|     // these two variables informs on the current distinct map and | ||||
|     // on the raw offset of the start of the group where the | ||||
|     // range.start bound is located according to the distinct function | ||||
|     let mut distinct_map = DistinctMap::new(distinct_size); | ||||
|     let mut distinct_raw_offset = 0; | ||||
|  | ||||
|     'criteria: for criterion in criteria.as_ref() { | ||||
|         let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||
|         let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); | ||||
|         let mut documents_seen = 0; | ||||
|  | ||||
|         for mut group in tmp_groups { | ||||
|             // if this group does not overlap with the requested range, | ||||
|             // push it without sorting and splitting it | ||||
|             if documents_seen + group.len() < distinct_raw_offset { | ||||
|                 documents_seen += group.len(); | ||||
|                 groups.push(group); | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             let ctx = ContextMut { | ||||
|                 reader, | ||||
|                 postings_lists: &mut arena, | ||||
|                 query_enhancer: &mut query_enhancer, | ||||
|                 automatons: &mut automatons, | ||||
|                 documents_fields_counts_store, | ||||
|             }; | ||||
|  | ||||
|             let before_criterion_preparation = Instant::now(); | ||||
|             criterion.prepare(ctx, &mut group)?; | ||||
|             debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); | ||||
|  | ||||
|             let ctx = Context { | ||||
|                 postings_lists: &arena, | ||||
|                 query_enhancer: &query_enhancer, | ||||
|                 automatons: &automatons, | ||||
|             }; | ||||
|  | ||||
|             let before_criterion_sort = Instant::now(); | ||||
|             group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); | ||||
|             debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); | ||||
|  | ||||
|             for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { | ||||
|                 // we must compute the real distinguished len of this sub-group | ||||
|                 for document in group.iter() { | ||||
|                     let filter_accepted = match &filter { | ||||
|                         Some(filter) => { | ||||
|                             let entry = filter_map.entry(document.id); | ||||
|                             *entry.or_insert_with(|| (filter)(document.id)) | ||||
|                         } | ||||
|                         None => true, | ||||
|                     }; | ||||
|  | ||||
|                     if filter_accepted { | ||||
|                         let entry = key_cache.entry(document.id); | ||||
|                         let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); | ||||
|  | ||||
|                         match key.clone() { | ||||
|                             Some(key) => buf_distinct.register(key), | ||||
|                             None => buf_distinct.register_without_key(), | ||||
|                         }; | ||||
|                     } | ||||
|  | ||||
|                     // the requested range end is reached: stop computing distinct | ||||
|                     if buf_distinct.len() >= range.end { | ||||
|                         break; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 documents_seen += group.len(); | ||||
|                 groups.push(group); | ||||
|  | ||||
|                 // if this sub-group does not overlap with the requested range | ||||
|                 // we must update the distinct map and its start index | ||||
|                 if buf_distinct.len() < range.start { | ||||
|                     buf_distinct.transfert_to_internal(); | ||||
|                     distinct_raw_offset = documents_seen; | ||||
|                 } | ||||
|  | ||||
|                 // we have sort enough documents if the last document sorted is after | ||||
|                 // the end of the requested range, we can continue to the next criterion | ||||
|                 if buf_distinct.len() >= range.end { | ||||
|                     continue 'criteria; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // once we classified the documents related to the current | ||||
|     // automatons we save that as the next valid result | ||||
|     let mut seen = BufferedDistinctMap::new(&mut distinct_map); | ||||
|  | ||||
|     let mut documents = Vec::with_capacity(range.len()); | ||||
|     for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) { | ||||
|         let filter_accepted = match &filter { | ||||
|             Some(_) => filter_map.remove(&raw_document.id).unwrap(), | ||||
|             None => true, | ||||
|         }; | ||||
|  | ||||
|         if filter_accepted { | ||||
|             let key = key_cache.remove(&raw_document.id).unwrap(); | ||||
|             let distinct_accepted = match key { | ||||
|                 Some(key) => seen.register(key), | ||||
|                 None => seen.register_without_key(), | ||||
|             }; | ||||
|  | ||||
|             if distinct_accepted && seen.len() > range.start { | ||||
|                 documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref())); | ||||
|                 if documents.len() == range.len() { | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(documents) | ||||
|     unimplemented!() | ||||
| } | ||||
|  | ||||
| pub struct BareMatch<'tag> { | ||||
|   | ||||
| @@ -9,13 +9,13 @@ pub struct Attribute; | ||||
| impl Criterion for Attribute { | ||||
|     fn name(&self) -> &str { "attribute" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -11,9 +11,9 @@ pub struct Exact; | ||||
| impl Criterion for Exact { | ||||
|     fn name(&self) -> &str { "exact" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|   | ||||
| @@ -1,13 +1,16 @@ | ||||
| use std::cmp::{self, Ordering}; | ||||
| use std::collections::HashMap; | ||||
| use std::ops::Range; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use sdset::SetBuf; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::{store, RawDocument, MResult}; | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; | ||||
| use crate::database::MainT; | ||||
| use crate::query_tree::QueryId; | ||||
| use crate::{store, RawDocument, MResult}; | ||||
|  | ||||
| mod typo; | ||||
| mod words; | ||||
| @@ -30,26 +33,26 @@ pub use self::sort_by_attr::SortByAttr; | ||||
| pub trait Criterion { | ||||
|     fn name(&self) -> &str; | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         _documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn evaluate<'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: &Context<'p, 'tag, 'txn, 'q>, | ||||
|         lhs: &RawDocument<'r, 'tag>, | ||||
|         rhs: &RawDocument<'r, 'tag>, | ||||
|     ) -> Ordering; | ||||
|  | ||||
|     #[inline] | ||||
|     fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn eq<'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: &Context<'p, 'tag, 'txn, 'q>, | ||||
|         lhs: &RawDocument<'r, 'tag>, | ||||
|         rhs: &RawDocument<'r, 'tag>, | ||||
|     ) -> bool | ||||
| @@ -58,18 +61,16 @@ pub trait Criterion { | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> { | ||||
| pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> { | ||||
|     pub reader: &'h heed::RoTxn<MainT>, | ||||
|     pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     pub query_enhancer: &'q mut QueryEnhancer, | ||||
|     pub automatons: &'a mut [QueryWordAutomaton], | ||||
|     pub query_mapping: &'q HashMap<QueryId, Range<usize>>, | ||||
|     pub documents_fields_counts_store: store::DocumentsFieldsCounts, | ||||
| } | ||||
|  | ||||
| pub struct Context<'p, 'tag, 'txn, 'q, 'a> { | ||||
| pub struct Context<'p, 'tag, 'txn, 'q> { | ||||
|     pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     pub query_enhancer: &'q QueryEnhancer, | ||||
|     pub automatons: &'a [QueryWordAutomaton], | ||||
|     pub query_mapping: &'q HashMap<QueryId, Range<usize>>, | ||||
| } | ||||
|  | ||||
| #[derive(Default)] | ||||
| @@ -138,7 +139,7 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> { | ||||
|  | ||||
| fn prepare_query_distances<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     query_mapping: &HashMap<QueryId, Range<usize>>, | ||||
|     postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
| ) { | ||||
|     for document in documents { | ||||
| @@ -148,7 +149,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( | ||||
|         for m in document.bare_matches.iter() { | ||||
|             if postings_lists[m.postings_list].is_empty() { continue } | ||||
|  | ||||
|             let range = query_enhancer.replacement(m.query_index as u32); | ||||
|             let range = query_mapping[&(m.query_index as usize)].clone(); | ||||
|             let new_len = cmp::max(range.end as usize, processed.len()); | ||||
|             processed.resize(new_len, None); | ||||
|  | ||||
| @@ -169,7 +170,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( | ||||
| fn prepare_bare_matches<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     query_mapping: &HashMap<QueryId, Range<usize>>, | ||||
| ) { | ||||
|     for document in documents { | ||||
|         if !document.processed_matches.is_empty() { continue } | ||||
| @@ -190,14 +191,14 @@ fn prepare_bare_matches<'a, 'tag, 'txn>( | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let processed = multiword_rewrite_matches(&mut processed, query_enhancer); | ||||
|         let processed = multiword_rewrite_matches(&mut processed, query_mapping); | ||||
|         document.processed_matches = processed.into_vec(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn multiword_rewrite_matches( | ||||
|     matches: &mut [SimpleMatch], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     query_mapping: &HashMap<QueryId, Range<usize>>, | ||||
| ) -> SetBuf<SimpleMatch> | ||||
| { | ||||
|     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); | ||||
| @@ -218,7 +219,7 @@ fn multiword_rewrite_matches( | ||||
|             // find the biggest padding | ||||
|             let mut biggest = 0; | ||||
|             for match_ in same_word_index { | ||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index as u32); | ||||
|                 let mut replacement = query_mapping[&(match_.query_index as usize)].clone(); | ||||
|                 let replacement_len = replacement.len(); | ||||
|                 let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); | ||||
|  | ||||
| @@ -240,7 +241,7 @@ fn multiword_rewrite_matches( | ||||
|                         let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|  | ||||
|                         for nmatch_ in next_group { | ||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); | ||||
|                             let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone(); | ||||
|                             let query_index = rep.next().unwrap() as u16; | ||||
|                             if query_index == padmatch.query_index { | ||||
|                                 if !found { | ||||
|   | ||||
| @@ -11,13 +11,13 @@ pub struct Proximity; | ||||
| impl Criterion for Proximity { | ||||
|     fn name(&self) -> &str { "proximity" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -7,13 +7,13 @@ pub struct Typo; | ||||
| impl Criterion for Typo { | ||||
|     fn name(&self) -> &str { "typo" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); | ||||
|         prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists); | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -7,13 +7,13 @@ pub struct Words; | ||||
| impl Criterion for Words { | ||||
|     fn name(&self) -> &str { "words" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); | ||||
|         prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists); | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -9,13 +9,13 @@ pub struct WordsPosition; | ||||
| impl Criterion for WordsPosition { | ||||
|     fn name(&self) -> &str { "words position" } | ||||
|  | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( | ||||
|     fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>( | ||||
|         &self, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, | ||||
|         ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>, | ||||
|         documents: &mut [RawDocument<'r, 'tag>], | ||||
|     ) -> MResult<()> | ||||
|     { | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); | ||||
|         prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping); | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -97,17 +97,19 @@ impl Document { | ||||
|     #[cfg(not(test))] | ||||
|     pub fn from_raw<'a, 'tag, 'txn>( | ||||
|         raw_document: RawDocument<'a, 'tag>, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         // automatons: &[QueryWordAutomaton], | ||||
|         arena: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         searchable_attrs: Option<&ReorderedAttrs>, | ||||
|     ) -> Document | ||||
|     { | ||||
|         let highlights = highlights_from_raw_document( | ||||
|             &raw_document, | ||||
|             automatons, | ||||
|             arena, | ||||
|             searchable_attrs, | ||||
|         ); | ||||
|         // let highlights = highlights_from_raw_document( | ||||
|         //     &raw_document, | ||||
|         //     automatons, | ||||
|         //     arena, | ||||
|         //     searchable_attrs, | ||||
|         // ); | ||||
|  | ||||
|         let highlights = Vec::new(); | ||||
|  | ||||
|         Document { id: raw_document.id, highlights } | ||||
|     } | ||||
| @@ -115,19 +117,21 @@ impl Document { | ||||
|     #[cfg(test)] | ||||
|     pub fn from_raw<'a, 'tag, 'txn>( | ||||
|         raw_document: RawDocument<'a, 'tag>, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         // automatons: &[QueryWordAutomaton], | ||||
|         arena: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         searchable_attrs: Option<&ReorderedAttrs>, | ||||
|     ) -> Document | ||||
|     { | ||||
|         use crate::bucket_sort::SimpleMatch; | ||||
|  | ||||
|         let highlights = highlights_from_raw_document( | ||||
|             &raw_document, | ||||
|             automatons, | ||||
|             arena, | ||||
|             searchable_attrs, | ||||
|         ); | ||||
|         // let highlights = highlights_from_raw_document( | ||||
|         //     &raw_document, | ||||
|         //     automatons, | ||||
|         //     arena, | ||||
|         //     searchable_attrs, | ||||
|         // ); | ||||
|  | ||||
|         let highlights = Vec::new(); | ||||
|  | ||||
|         let mut matches = Vec::new(); | ||||
|         for sm in raw_document.processed_matches { | ||||
|   | ||||
| @@ -1,5 +1,4 @@ | ||||
| use compact_arena::SmallArena; | ||||
| use itertools::EitherOrBoth; | ||||
| use sdset::SetBuf; | ||||
| use crate::DocIndex; | ||||
| use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; | ||||
| @@ -19,10 +18,9 @@ pub struct RawDocument<'a, 'tag> { | ||||
| impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||
|     pub fn new<'txn>( | ||||
|         bare_matches: &'a mut [BareMatch<'tag>], | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         searchable_attrs: Option<&ReorderedAttrs>, | ||||
|     ) -> Option<RawDocument<'a, 'tag>> | ||||
|     ) -> RawDocument<'a, 'tag> | ||||
|     { | ||||
|         if let Some(reordered_attrs) = searchable_attrs { | ||||
|             for bm in bare_matches.iter() { | ||||
| @@ -42,70 +40,12 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||
|  | ||||
|         bare_matches.sort_unstable_by_key(|m| m.query_index); | ||||
|  | ||||
|         let mut previous_word = None; | ||||
|         for i in 0..bare_matches.len() { | ||||
|             let a = &bare_matches[i]; | ||||
|             let auta = &automatons[a.query_index as usize]; | ||||
|  | ||||
|             match auta.phrase_query { | ||||
|                 Some((0, _)) => { | ||||
|                     let b = match bare_matches.get(i + 1) { | ||||
|                         Some(b) => b, | ||||
|                         None => { | ||||
|                             postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                             continue; | ||||
|                         } | ||||
|                     }; | ||||
|  | ||||
|                     if a.query_index + 1 != b.query_index { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                         continue | ||||
|                     } | ||||
|  | ||||
|                     let pla = &postings_lists[a.postings_list]; | ||||
|                     let plb = &postings_lists[b.postings_list]; | ||||
|  | ||||
|                     let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||
|                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||
|                     }); | ||||
|  | ||||
|                     let mut newa = Vec::new(); | ||||
|                     let mut newb = Vec::new(); | ||||
|  | ||||
|                     for eb in iter { | ||||
|                         if let EitherOrBoth::Both(a, b) = eb { | ||||
|                             newa.push(*a); | ||||
|                             newb.push(*b); | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     if !newa.is_empty() { | ||||
|                         previous_word = Some(a.query_index); | ||||
|                     } | ||||
|  | ||||
|                     postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); | ||||
|                     postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); | ||||
|                 }, | ||||
|                 Some((1, _)) => { | ||||
|                     if previous_word.take() != Some(a.query_index - 1) { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                     } | ||||
|                 }, | ||||
|                 Some((_, _)) => unreachable!(), | ||||
|                 None => (), | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { | ||||
|             return None | ||||
|         } | ||||
|  | ||||
|         Some(RawDocument { | ||||
|         RawDocument { | ||||
|             id: bare_matches[0].document_id, | ||||
|             bare_matches, | ||||
|             processed_matches: Vec::new(), | ||||
|             processed_distances: Vec::new(), | ||||
|             contains_one_word_field: false, | ||||
|         }) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user