Make all search tests pass, fix distinctAttribute bug

This commit is contained in:
Loïc Lecrenier
2023-04-24 12:11:25 +02:00
parent a7a0891210
commit d1fdbb63da
17 changed files with 465 additions and 327 deletions

View File

@@ -28,7 +28,6 @@ pub struct Search<'a> {
limit: usize,
sort_criteria: Option<Vec<AscDesc>>,
terms_matching_strategy: TermsMatchingStrategy,
authorize_typos: bool,
words_limit: usize,
exhaustive_number_hits: bool,
rtxn: &'a heed::RoTxn<'a>,
@@ -44,7 +43,6 @@ impl<'a> Search<'a> {
limit: 20,
sort_criteria: None,
terms_matching_strategy: TermsMatchingStrategy::default(),
authorize_typos: true,
exhaustive_number_hits: false,
words_limit: 10,
rtxn,
@@ -77,11 +75,6 @@ impl<'a> Search<'a> {
self
}
/// Sets the query-level typo authorization flag to `value`.
///
/// Returns `self` so that further setter calls can be chained.
pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> {
self.authorize_typos = value;
self
}
pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
self.words_limit = value;
self
@@ -99,13 +92,6 @@ impl<'a> Search<'a> {
self
}
// TODO!
/// Reports whether typos are authorized for this search.
///
/// The result is `true` only when both the query-level `authorize_typos`
/// flag and the value returned by `self.index.authorize_typos(self.rtxn)`
/// are `true`.
///
/// # Errors
///
/// Propagates any error returned by `self.index.authorize_typos`.
fn _is_typo_authorized(&self) -> Result<bool> {
let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?;
// only authorize typos if both the index and the query allow it.
Ok(self.authorize_typos && index_authorizes_typos)
}
pub fn execute(&self) -> Result<SearchResult> {
let mut ctx = SearchContext::new(self.index, self.rtxn);
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
@@ -142,7 +128,6 @@ impl fmt::Debug for Search<'_> {
limit,
sort_criteria,
terms_matching_strategy,
authorize_typos,
words_limit,
exhaustive_number_hits,
rtxn: _,
@@ -155,7 +140,6 @@ impl fmt::Debug for Search<'_> {
.field("limit", limit)
.field("sort_criteria", sort_criteria)
.field("terms_matching_strategy", terms_matching_strategy)
.field("authorize_typos", authorize_typos)
.field("exhaustive_number_hits", exhaustive_number_hits)
.field("words_limit", words_limit)
.finish()
@@ -231,92 +215,4 @@ mod test {
assert_eq!(documents_ids, vec![1]);
}
// #[test]
// fn test_is_authorized_typos() {
// let index = TempIndex::new();
// let mut txn = index.write_txn().unwrap();
// let mut search = Search::new(&txn, &index);
// // default is authorized
// assert!(search.is_typo_authorized().unwrap());
// search.authorize_typos(false);
// assert!(!search.is_typo_authorized().unwrap());
// index.put_authorize_typos(&mut txn, false).unwrap();
// txn.commit().unwrap();
// let txn = index.read_txn().unwrap();
// let mut search = Search::new(&txn, &index);
// assert!(!search.is_typo_authorized().unwrap());
// search.authorize_typos(true);
// assert!(!search.is_typo_authorized().unwrap());
// }
// #[test]
// fn test_one_typos_tolerance() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
// assert_eq!(found, &[("zealand".to_string(), 1)]);
// }
// #[test]
// fn test_one_typos_first_letter() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
// assert_eq!(found, &[]);
// }
// #[test]
// fn test_two_typos_tolerance() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
// assert_eq!(found, &[("zealand".to_string(), 2)]);
// }
// #[test]
// fn test_two_typos_first_letter() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
// assert_eq!(found, &[("zealand".to_string(), 2)]);
// }
// #[test]
// fn test_prefix() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
// assert_eq!(found, &[("zealand".to_string(), 0)]);
// }
// #[test]
// fn test_bad_prefix() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
// assert_eq!(found, &[]);
// }
// #[test]
// fn test_prefix_with_typo() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
// assert_eq!(found, &[("zealand".to_string(), 1)]);
// }
}

View File

@@ -88,7 +88,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
};
}
let mut all_candidates = RoaringBitmap::new();
let mut all_candidates = universe.clone();
let mut valid_docids = vec![];
let mut cur_offset = 0usize;
@@ -162,8 +162,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
)?;
}
all_candidates |= &ranking_rule_universes[0];
Ok(BucketSortOutput { docids: valid_docids, all_candidates })
}
@@ -193,12 +191,14 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
apply_distinct_rule(ctx, distinct_fid, &candidates)?;
for universe in ranking_rule_universes.iter_mut() {
*universe -= &excluded;
*all_candidates -= &excluded;
}
remaining
} else {
candidates.clone()
};
*all_candidates |= &candidates;
// if the candidates are empty, there is nothing to do;
if candidates.is_empty() {
return Ok(());
@@ -216,8 +216,8 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
);
} else {
// otherwise, skip some of the documents and add some of the rest, in order of ids
let all_candidates = candidates.iter().collect::<Vec<_>>();
let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset);
let candidates_vec = candidates.iter().collect::<Vec<_>>();
let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);
logger.skip_bucket_ranking_rule(
cur_ranking_rule_index,

View File

@@ -243,7 +243,7 @@ pub(crate) mod tests {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the" },
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
]))
.unwrap();
temp_index
@@ -305,7 +305,7 @@ pub(crate) mod tests {
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
None
);
assert_eq!(
matching_words

View File

@@ -599,7 +599,7 @@ mod tests {
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>ôle"
@"<em>Ŵôřlḑôle</em>"
);
// Text containing unicode match.
@@ -621,7 +621,7 @@ mod tests {
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Westfáli</em>a"
@"<em>Westfália</em>"
);
}

View File

@@ -184,11 +184,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
for rr in settings_ranking_rules {
// Add Words before any of: typo, proximity, attribute, exactness
match rr {
crate::Criterion::Typo
| crate::Criterion::Attribute
| crate::Criterion::Proximity
// TODO: no exactness
| crate::Criterion::Exactness => {
crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => {
if !words {
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
words = true;
@@ -339,6 +335,8 @@ pub fn execute_search(
check_sort_criteria(ctx, sort_criteria.as_ref())?;
// TODO: if the exactness criterion is the first one, then
// use a different strategy to find the universe (union of any term)
universe = resolve_maximally_reduced_query_graph(
ctx,
&universe,

View File

@@ -56,8 +56,13 @@ impl RankingRuleGraphTrait for PositionGraph {
}
for phrase in term.term_subset.all_phrases(ctx)? {
for &word in phrase.words(ctx).iter().flatten() {
let positions = ctx.get_db_word_positions(word)?;
// Only check the position of the first word in the phrase.
// This is not correct, but it is the best we can do, since
// it is difficult/impossible to know the expected position
// of a word in a phrase.
// There is probably a more correct way to do it, though.
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
let positions = ctx.get_db_word_positions(*word)?;
all_positions.extend(positions);
}
}

View File

@@ -79,11 +79,6 @@ pub fn compute_docids(
//
// This is an optimisation to avoid checking for an excessive number of
// pairs.
// WAIT, NO.
// This should only be done once per node.
// Here, we'll potentially do it... 16 times?
// Maybe we should do it at edge-build time instead.
// Same for the future attribute ranking rule.
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
if right_derivs.len() > 1 {
let universe = &universe;
@@ -190,11 +185,6 @@ fn compute_non_prefix_edges(
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
) -> Result<()> {
let mut used_left_phrases = BTreeSet::new();
let mut used_right_phrases = BTreeSet::new();
let mut used_left_words = BTreeSet::new();
let mut used_right_words = BTreeSet::new();
let mut universe = universe.clone();
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
@@ -204,25 +194,19 @@ fn compute_non_prefix_edges(
return Ok(());
}
}
if let Some(left_phrase) = left_phrase {
used_left_phrases.insert(left_phrase);
}
if let Some(right_phrase) = right_phrase {
used_right_phrases.insert(right_phrase);
}
if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
{
let new_docids = &universe & new_docids;
if !new_docids.is_empty() {
used_left_words.insert(word1);
used_right_words.insert(word2);
*docids |= new_docids;
}
}
if backward_proximity >= 1
// no swapping when either term is a phrase
// TODO: for now, we don't do any swapping when either term is a phrase
// but maybe we should. We'd need to look at the first/last word of the phrase
// depending on the context.
&& left_phrase.is_none() && right_phrase.is_none()
{
if let Some(new_docids) =
@@ -230,8 +214,6 @@ fn compute_non_prefix_edges(
{
let new_docids = &universe & new_docids;
if !new_docids.is_empty() {
used_left_words.insert(word2);
used_right_words.insert(word1);
*docids |= new_docids;
}
}

View File

@@ -69,11 +69,16 @@ pub fn compute_query_term_subset_docids_within_field_id(
}
for phrase in term.all_phrases(ctx)? {
for &word in phrase.words(ctx).iter().flatten() {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? {
docids |= word_fid_docids;
let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
// There may be false positives when resolving a phrase, so we're not
// guaranteed that all of its words are within a single fid.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
phrase_docids &= word_fid_docids;
}
}
docids |= phrase_docids;
}
if let Some(word_prefix) = term.use_prefix_db(ctx) {
@@ -104,11 +109,16 @@ pub fn compute_query_term_subset_docids_within_position(
}
for phrase in term.all_phrases(ctx)? {
for &word in phrase.words(ctx).iter().flatten() {
if let Some(word_position_docids) = ctx.get_db_word_position_docids(word, position)? {
docids |= word_position_docids;
let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
// It's difficult to know the expected position of the words in the phrase,
// so instead we just check the first one.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
phrase_docids &= word_position_docids;
}
}
docids |= phrase_docids;
}
if let Some(word_prefix) = term.use_prefix_db(ctx) {

View File

@@ -1229,7 +1229,6 @@ mod tests {
// testing the simple query search
let mut search = crate::Search::new(&rtxn, &index);
search.query("document");
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
// all documents should be returned
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
@@ -1335,7 +1334,6 @@ mod tests {
// testing the simple query search
let mut search = crate::Search::new(&rtxn, &index);
search.query("document");
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
// all documents should be returned
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
@@ -1582,7 +1580,6 @@ mod tests {
let mut search = crate::Search::new(&rtxn, &index);
search.query("化妆包");
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
// only 1 document should be returned

View File

@@ -1,17 +1,384 @@
{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"etiopia","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1}
{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"fehérorosz","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2}
{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"etiopia","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3}
{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"fehérorosz","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"}
{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"észak-korea","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"}
{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"etiopia","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null}
{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"fehérorosz","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]}
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"észak-korea","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8}
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"etiopia","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"}
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"észak-korea","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"fehérorosz","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"}
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"etiopia","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12}
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"fehérorosz","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]}
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"észak-korea","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}}
{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"etiopia","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]}
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"fehérorosz","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16}
{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"észak-korea","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""}
{
"id": "A",
"word_rank": 0,
"typo_rank": 2,
"proximity_rank": 16,
"attribute_rank": 224,
"exact_rank": 6,
"asc_desc_rank": 0,
"sort_by_rank": 0,
"geo_rank": 43,
"title": "hell o",
"description": "hell o is the fourteenth episode of the american television series glee performing songs with this word",
"tag": "etiopia",
"_geo": {
"lat": 50.62984446145472,
"lng": 3.085712705162039
},
"": "",
"opt1": [
null
],
"tag_in": 1
}
{
"id": "B",
"word_rank": 2,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 0,
"exact_rank": 0,
"asc_desc_rank": 1,
"sort_by_rank": 2,
"geo_rank": 191,
"title": "hello",
"description": "hello is a song recorded by english singer songwriter adele",
"tag": "fehérorosz",
"_geo": {
"lat": 50.63047567664291,
"lng": 3.088852230809636
},
"": "",
"opt1": [],
"tag_in": 2
}
{
"id": "C",
"word_rank": 0,
"typo_rank": 1,
"proximity_rank": 10,
"attribute_rank": 111,
"exact_rank": 6,
"asc_desc_rank": 2,
"sort_by_rank": 0,
"geo_rank": 283,
"title": "hell on earth",
"description": "hell on earth is the third studio album by american hip hop duo mobb deep",
"tag": "etiopia",
"_geo": {
"lat": 50.6321800003937,
"lng": 3.088331882262139
},
"": "",
"opt1": null,
"tag_in": 3
}
{
"id": "D",
"word_rank": 0,
"typo_rank": 1,
"proximity_rank": 16,
"attribute_rank": 213,
"exact_rank": 5,
"asc_desc_rank": 3,
"sort_by_rank": 2,
"geo_rank": 1381,
"title": "hell on wheels tv series",
"description": "the construction of the first transcontinental railroad across the united states in the world",
"tag": "fehérorosz",
"_geo": {
"lat": 50.63728851135729,
"lng": 3.0703951595971626
},
"": "",
"opt1": 4,
"tag_in": "four"
}
{
"id": "E",
"word_rank": 2,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 0,
"exact_rank": 1,
"asc_desc_rank": 4,
"sort_by_rank": 1,
"geo_rank": 1979,
"title": "hello kitty",
"description": "also known by her full name kitty white is a fictional character produced by the japanese company sanrio",
"tag": "észak-korea",
"_geo": {
"lat": 50.64264610511925,
"lng": 3.0665099941857634
},
"": "",
"opt1": "E",
"tag_in": "five"
}
{
"id": "F",
"word_rank": 2,
"typo_rank": 1,
"proximity_rank": 0,
"attribute_rank": 116,
"exact_rank": 5,
"asc_desc_rank": 5,
"sort_by_rank": 0,
"geo_rank": 65022,
"title": "laptop orchestra",
"description": "a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra",
"tag": "etiopia",
"_geo": {
"lat": 51.05028653642387,
"lng": 3.7301072771642096
},
"": "",
"opt1": [
"F"
],
"tag_in": null
}
{
"id": "G",
"word_rank": 1,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 1,
"exact_rank": 3,
"asc_desc_rank": 5,
"sort_by_rank": 2,
"geo_rank": 34692,
"title": "hello world film",
"description": "hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica",
"tag": "fehérorosz",
"_geo": {
"lat": 50.78776041427129,
"lng": 2.661201766290338
},
"": "",
"opt1": [
7
]
}
{
"id": "H",
"word_rank": 1,
"typo_rank": 0,
"proximity_rank": 1,
"attribute_rank": 1,
"exact_rank": 3,
"asc_desc_rank": 4,
"sort_by_rank": 1,
"geo_rank": 202182,
"title": "world hello day",
"description": "holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force",
"tag": "észak-korea",
"_geo": {
"lat": 48.875617484531965,
"lng": 2.346747821504194
},
"": "",
"opt1": [
"H",
8
],
"tag_in": 8
}
{
"id": "I",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 9,
"attribute_rank": 125,
"exact_rank": 3,
"asc_desc_rank": 3,
"sort_by_rank": 0,
"geo_rank": 740667,
"title": "hello world song",
"description": "hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum",
"tag": "etiopia",
"_geo": {
"lat": 43.973998070351065,
"lng": 3.4661837318345032
},
"": "",
"tag_in": "nine"
}
{
"id": "J",
"word_rank": 1,
"typo_rank": 0,
"proximity_rank": 1,
"attribute_rank": 2,
"exact_rank": 3,
"asc_desc_rank": 2,
"sort_by_rank": 1,
"geo_rank": 739020,
"title": "hello cruel world",
"description": "hello cruel world is an album by new zealand band tall dwarfs",
"tag": "észak-korea",
"_geo": {
"lat": 43.98920130353838,
"lng": 3.480519311627928
},
"": "",
"opt1": {},
"tag_in": 10
}
{
"id": "K",
"word_rank": 0,
"typo_rank": 2,
"proximity_rank": 10,
"attribute_rank": 213,
"exact_rank": 6,
"asc_desc_rank": 1,
"sort_by_rank": 2,
"geo_rank": 738830,
"title": "hallo creation system",
"description": "in few word hallo was a construction toy created by the american company mattel to engage girls in construction play",
"tag": "fehérorosz",
"_geo": {
"lat": 43.99155030238669,
"lng": 3.503453528249425
},
"": "",
"opt1": [
{
"opt2": 11
}
],
"tag_in": "eleven"
}
{
"id": "L",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 2,
"attribute_rank": 107,
"exact_rank": 5,
"asc_desc_rank": 0,
"sort_by_rank": 0,
"geo_rank": 737861,
"title": "good morning world",
"description": "good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season",
"tag": "etiopia",
"_geo": {
"lat": 44.000507750283695,
"lng": 3.5116812040621572
},
"": "",
"opt1": {
"opt2": [
12
]
},
"tag_in": 12
}
{
"id": "M",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 3,
"exact_rank": 0,
"asc_desc_rank": 0,
"sort_by_rank": 2,
"geo_rank": 739203,
"title": "hello world america",
"description": "a perfect match for a perfect engine using the query hello world america",
"tag": "fehérorosz",
"_geo": {
"lat": 43.99150729038736,
"lng": 3.606143957295055
},
"": "",
"opt1": [
13,
[
{
"opt2": null
}
]
]
}
{
"id": "N",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 3,
"exact_rank": 1,
"asc_desc_rank": 4,
"sort_by_rank": 1,
"geo_rank": 9499586,
"title": "hello world america unleashed",
"description": "a very good match for a very good engine using the query hello world america",
"tag": "észak-korea",
"_geo": {
"lat": 35.511540843367115,
"lng": 138.764368875787
},
"": "",
"opt1": {
"a": 1,
"opt2": {
"opt3": 14
}
}
}
{
"id": "O",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 3,
"exact_rank": 0,
"asc_desc_rank": 6,
"sort_by_rank": 0,
"geo_rank": 9425163,
"title": "a perfect match for a perfect engine using the query hello world america",
"description": "hello world america",
"tag": "etiopia",
"_geo": {
"lat": 35.00536702277189,
"lng": 135.76118763940391
},
"": "",
"opt1": [
[
[
[]
]
]
]
}
{
"id": "P",
"word_rank": 0,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 3,
"exact_rank": 1,
"asc_desc_rank": 3,
"sort_by_rank": 2,
"geo_rank": 9422437,
"title": "a very good match for a very good engine using the query hello world america",
"description": "hello world america unleashed",
"tag": "fehérorosz",
"_geo": {
"lat": 35.06462306367058,
"lng": 135.8338440354251
},
"": "",
"opt1.opt2": 16
}
{
"id": "Q",
"word_rank": 1,
"typo_rank": 0,
"proximity_rank": 0,
"attribute_rank": 1,
"exact_rank": 2,
"asc_desc_rank": 2,
"sort_by_rank": 1,
"geo_rank": 9339230,
"title": "hello world",
"description": "a hello world program generally is a computer program that outputs or displays the message hello world",
"tag": "észak-korea",
"_geo": {
"lat": 34.39548365683149,
"lng": 132.4535960928883
},
"": ""
}

View File

@@ -28,7 +28,7 @@ macro_rules! test_distinct {
search.query(search::TEST_QUERY);
search.limit($limit);
search.exhaustive_number_hits($exhaustive);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap();
@@ -37,7 +37,7 @@ macro_rules! test_distinct {
let mut distinct_values = HashSet::new();
let expected_external_ids: Vec<_> =
search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[])
search::expected_order(&criteria, TermsMatchingStrategy::default(), &[])
.into_iter()
.filter_map(|d| {
if distinct_values.contains(&d.$distinct) {

View File

@@ -18,7 +18,7 @@ macro_rules! test_filter {
let mut search = Search::new(&rtxn, &index);
search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
search.filter(filter_conditions);
@@ -26,7 +26,7 @@ macro_rules! test_filter {
let filtered_ids = search::expected_filtered_ids($filter);
let expected_external_ids: Vec<_> =
search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[])
search::expected_order(&criteria, TermsMatchingStrategy::default(), &[])
.into_iter()
.filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None })
.collect();

View File

@@ -61,7 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
// index documents
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let indexing_config = IndexDocumentsConfig::default();
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
@@ -96,7 +96,6 @@ pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> V
pub fn expected_order(
criteria: &[Criterion],
authorize_typo: bool,
optional_words: TermsMatchingStrategy,
sort_by: &[AscDesc],
) -> Vec<TestDocument> {
@@ -156,14 +155,11 @@ pub fn expected_order(
groups = std::mem::take(&mut new_groups);
}
if authorize_typo && optional_words == TermsMatchingStrategy::default() {
groups.into_iter().flatten().collect()
} else if optional_words == TermsMatchingStrategy::default() {
groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect()
} else if authorize_typo {
match optional_words {
TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(),
TermsMatchingStrategy::All => {
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
} else {
groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect()
}
}
}

View File

@@ -26,7 +26,6 @@ fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) {
let mut search = Search::new(&txn, &index);
search.query("\"the use of force\"");
search.limit(10);
search.authorize_typos(false);
search.terms_matching_strategy(TermsMatchingStrategy::All);
let result = search.execute().unwrap();

View File

@@ -13,14 +13,12 @@ use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
const ALLOW_TYPOS: bool = true;
const DISALLOW_TYPOS: bool = false;
const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last;
const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All;
const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000;
macro_rules! test_criterion {
($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => {
($func:ident, $optional_word:ident, $criteria:expr, $sort_criteria:expr) => {
#[test]
fn $func() {
let criteria = $criteria;
@@ -30,18 +28,13 @@ macro_rules! test_criterion {
let mut search = Search::new(&rtxn, &index);
search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos($authorize_typos);
search.terms_matching_strategy($optional_word);
search.sort_criteria($sort_criteria);
let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_external_ids: Vec<_> = search::expected_order(
&criteria,
$authorize_typos,
$optional_word,
&$sort_criteria[..],
)
let expected_external_ids: Vec<_> =
search::expected_order(&criteria, $optional_word, &$sort_criteria[..])
.into_iter()
.map(|d| d.id)
.collect();
@@ -51,148 +44,44 @@ macro_rules! test_criterion {
};
}
test_criterion!(none_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]);
test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]);
test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]);
test_criterion!(none, DISALLOW_OPTIONAL_WORDS, vec![], vec![]);
test_criterion!(words, ALLOW_OPTIONAL_WORDS, vec![Words], vec![]);
test_criterion!(attribute, DISALLOW_OPTIONAL_WORDS, vec![Attribute], vec![]);
test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, vec![Typo], vec![]);
test_criterion!(exactness, DISALLOW_OPTIONAL_WORDS, vec![Exactness], vec![]);
test_criterion!(proximity, DISALLOW_OPTIONAL_WORDS, vec![Proximity], vec![]);
test_criterion!(asc, DISALLOW_OPTIONAL_WORDS, vec![Asc(S("asc_desc_rank"))], vec![]);
test_criterion!(desc, DISALLOW_OPTIONAL_WORDS, vec![Desc(S("asc_desc_rank"))], vec![]);
test_criterion!(
attribute_allow_typo,
asc_unexisting_field,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Attribute],
vec![]
);
test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]);
test_criterion!(
attribute_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Attribute],
vec![]
);
test_criterion!(
exactness_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Exactness],
vec![]
);
test_criterion!(
exactness_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Exactness],
vec![]
);
test_criterion!(
proximity_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Proximity],
vec![]
);
test_criterion!(
proximity_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Proximity],
vec![]
);
test_criterion!(
asc_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Asc(S("asc_desc_rank"))],
vec![]
);
test_criterion!(
asc_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Asc(S("asc_desc_rank"))],
vec![]
);
test_criterion!(
desc_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Desc(S("asc_desc_rank"))],
vec![]
);
test_criterion!(
desc_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Desc(S("asc_desc_rank"))],
vec![]
);
test_criterion!(
asc_unexisting_field_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Asc(S("unexisting_field"))],
vec![]
);
test_criterion!(
asc_unexisting_field_disallow_typo,
desc_unexisting_field,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Asc(S("unexisting_field"))],
vec![]
);
test_criterion!(
desc_unexisting_field_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Desc(S("unexisting_field"))],
vec![]
);
test_criterion!(empty_sort_by, DISALLOW_OPTIONAL_WORDS, vec![Sort], vec![]);
test_criterion!(
desc_unexisting_field_disallow_typo,
sort_by_asc,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Desc(S("unexisting_field"))],
vec![]
);
test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]);
test_criterion!(
empty_sort_by_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Sort],
vec![]
);
test_criterion!(
sort_by_asc_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Sort],
vec![AscDesc::Asc(Member::Field(S("tag")))]
);
test_criterion!(
sort_by_asc_disallow_typo,
sort_by_desc,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Sort],
vec![AscDesc::Asc(Member::Field(S("tag")))]
);
test_criterion!(
sort_by_desc_allow_typo,
DISALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Sort],
vec![AscDesc::Desc(Member::Field(S("tag")))]
);
test_criterion!(
sort_by_desc_disallow_typo,
DISALLOW_OPTIONAL_WORDS,
DISALLOW_TYPOS,
vec![Sort],
vec![AscDesc::Desc(Member::Field(S("tag")))]
);
test_criterion!(
default_criteria_order,
ALLOW_OPTIONAL_WORDS,
ALLOW_TYPOS,
vec![Words, Typo, Proximity, Attribute, Exactness],
vec![]
);
@@ -354,12 +243,11 @@ fn criteria_mixup() {
search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.terms_matching_strategy(ALLOW_OPTIONAL_WORDS);
search.authorize_typos(ALLOW_TYPOS);
let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_external_ids: Vec<_> =
search::expected_order(&criteria, ALLOW_TYPOS, ALLOW_OPTIONAL_WORDS, &[])
search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, &[])
.into_iter()
.map(|d| d.id)
.collect();

View File

@@ -14,7 +14,7 @@ fn sort_ranking_rule_missing() {
let mut search = Search::new(&rtxn, &index);
search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]);

View File

@@ -19,7 +19,7 @@ fn test_typo_tolerance_one_typo() {
let mut search = Search::new(&txn, &index);
search.query("zeal");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -28,7 +28,7 @@ fn test_typo_tolerance_one_typo() {
let mut search = Search::new(&txn, &index);
search.query("zean");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -46,7 +46,7 @@ fn test_typo_tolerance_one_typo() {
let mut search = Search::new(&txn, &index);
search.query("zean");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -65,7 +65,7 @@ fn test_typo_tolerance_two_typo() {
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -74,7 +74,7 @@ fn test_typo_tolerance_two_typo() {
let mut search = Search::new(&txn, &index);
search.query("zealemd");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -92,7 +92,7 @@ fn test_typo_tolerance_two_typo() {
let mut search = Search::new(&txn, &index);
search.query("zealemd");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -142,7 +142,7 @@ fn test_typo_disabled_on_word() {
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -162,7 +162,7 @@ fn test_typo_disabled_on_word() {
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -182,7 +182,7 @@ fn test_disable_typo_on_attribute() {
// typo in `antebel(l)um`
search.query("antebelum");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();
@@ -200,7 +200,7 @@ fn test_disable_typo_on_attribute() {
let mut search = Search::new(&txn, &index);
search.query("antebelum");
search.limit(10);
search.authorize_typos(true);
search.terms_matching_strategy(TermsMatchingStrategy::default());
let result = search.execute().unwrap();